In [3]:
from __future__ import absolute_import
import sys
sys.path.append('/home/yichun/projects/information_retrieval')
from compute_similarity.WordMoverDistance import WordMoverDistance
from compute_similarity.CosineSimilarity import CosineSimilarity
import fasttext
import pandas as pd
import json
from process import tokenizer

from datetime import datetime

with open("/home/yichun/projects/information_retrieval/data/stopwords_fr.json", "r") as infile:
    stopwords_fr = json.load(infile)
topics = {'A': 'Rupture abusive de la relation de travail',
          'B': 'Rupture abusive du contrat de travail',
          'C': 'Rupture brutale de relations commerciales établies',
          'D': 'Rupture brutale des contrats',
          'E': 'Indemnité compensatrice de rupture',
          'F': 'Indemnité compensatrice de congés payés',
          'G': 'Indemnité compensatrice de préavis'
          }
synonymes = ['licenciement', 'dommages-intérêts', "dommages et intérêts", "dommages", "intérêts",
             "allocation", "compensation", "dédommagement", "indemnisation", "paiement", "pécule", "prestation",
               "départ", "expulsion",  "mise à la porte", "mise au chômage", "renvoi"
             , "illégitime", "injuste", "injustifié", "bail", "brusque"]

In [4]:
def get_topicId2text():
    """
    :return: topicId2text where text is  lowercase and tokenized
    """
    topicId2text = {}
    for topic in topics:
        tokens = topics[topic].split()
        tokens = [t.lower() for t in tokens if t not in stopwords_fr]
        topicId2text[topic] = tokens
    return topicId2text

In [5]:
def get_top_infos(sims, method):
    """
    Return top1topic, diff_top1topic_top2topic
    if method is 'wmd', top1topic is the topic has the min distance
    if mthod is 'cosine', top1topic is the topic has the max similarity score
    :param {float[]} sims: list of similarities/dissimilarities
    :param {string} method: method for mesuring similarities
    :return: top1topic, diff_top1topic_top2topic
    """
    t = [k for k, v in topics.items()]
    if method == 'wmd':
        top1topic_index, top2topic_index = sorted(range(len(sims)), key=lambda i: sims[i])[:2]
    elif method == 'cosine':
        top1topic_index, top2topic_index = sorted(range(len(sims)), key=lambda i: sims[i], reverse=True)[:2]
    diff_top1sim_top2sim = abs(sims[top1topic_index] - sims[top2topic_index])
    top1topic = t[top1topic_index]
    return top1topic, diff_top1sim_top2sim

In [6]:
def create_similarities_columns(method):
    """
    Read file which contains similarities computed and create a column
    """
    file = "/home/yichun/projects/information_retrieval/compute_similarity/results/scores_{}".format(method)
    df = pd.read_csv(file, sep='\t', header=None)
    df.iloc[:, 1] = df.iloc[:, 1].apply(lambda x: [float(e) for e in x.split(' ')])
    #df.iloc[:, 1] = df.iloc[:, 1].apply(lambda x: [round(float(e), 2) for e in x.split(' ')])
    return df.iloc[:, 1]

In [7]:
def store_similarities(judgements, method, vectors):
    """
    It takes sometimes lots of times for calculating similarities, i.e wmd. So we store it in files
    :param judgements:
    :param similarity_calculater:
    :param method:
    :param vectorizer:
    :param topicId2text:
    :return:
    """
    texts = judgements['text']
    start = datetime.now()
    for i in range(len(texts)):
        try:
            scores = compute_similarity_with_topics(texts[i], method, vectors)
            scores_str = ' '.join([str(s) for s in scores])
            with open("/home/yichun/projects/information_retrieval/compute_similarity/results/scores_{}".format(method), 'a') as f:
                f.write("{}\t{}\n".format(i+1, scores_str))
        except Exception:
            pass
            with open("/home/yichun/projects/information_retrieval/compute_similarity/results/scores_{}".format(method), 'a') as f:
                f.write("{}\t{}\n".format(i+1, "error"))
        fin = datetime.now()
    #print("executing time = ", fin-start)

In [8]:
def compute_similarity_with_topics(sent, method, vectors):
    """
    Return similarities bewteen sentence and topics
    """
    vectorizer = fasttext.load_model(vectors)
    topicId2text = get_topicId2text()
    if method == "wmd":
        similarity_calculator = WordMoverDistance()
    elif method == "cosine":
        similarity_calculator = CosineSimilarity()
    tokens = tokenizer(sent.lower())
    tokens = [t.lower() for t in tokens if t not in stopwords_fr]
    scores = [similarity_calculator.get_similarity_between_two_sentences(tokens, topicId2text[topic], vectorizer) for topic in topicId2text]
    return scores

In [15]:
def compute_similarities(judgements, method=None, vectors=None):
    """
    :param {dataframe} judgements:
    :param {string} method: method to compute similarity bewteen sentences
    :param vectors: vectors bin file
    :return:
    """

    #store_similarities(judgements, method, vectors) #uncomment this line for storing similarities
    judgements['sims'] = create_similarities_columns(method)
    judgements[['sim_a','sim_b', 'sim_c', 'sim_d', 'sim_e', 'sim_f', 'sim_g']] = pd.DataFrame(judgements.sims.values.tolist(), index= judgements.index)
    judgements['topic_similarity'], judgements[('diff_top1sim_top2sim')] = zip(*judgements.apply(lambda row: get_top_infos(row['sims'], method), axis=1))
    return judgements

In [16]:
judgements = pd.read_csv('/home/yichun/projects/information_retrieval/data/judgements')
compute_similarities(judgements[:5], method='cosine', vectors='/home/yichun/projects/information_retrieval/word_embedding/fasttext_model_50d.bin')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


Unnamed: 0,judgementId,text,sims,sim_a,sim_b,sim_c,sim_d,sim_e,sim_f,sim_g,topic_similarity,diff_top1sim_top2sim
0,1,par mise à disposition au greffe déclare l'ass...,"[0.24780075252056122, 0.22206394374370575, -0....",0.247801,0.222064,-0.018451,0.071051,0.256975,0.222961,0.225813,E,0.009174
1,2,annule les fermes rappels des 4102010 23122010...,"[0.45778682827949524, 0.46336841583251953, 0.1...",0.457787,0.463368,0.168021,0.448591,0.730311,0.723619,0.740945,G,0.010634
2,3,déclare recevable l'appel formé par la caisse ...,"[0.2490444779396057, 0.24885991215705872, 0.22...",0.249044,0.24886,0.223531,0.215498,0.225614,0.120425,0.078748,A,0.000185
3,4,statuant par arrêt réputé contradictoire et en...,"[0.1129513755440712, 0.1519133448600769, -0.16...",0.112951,0.151913,-0.168179,-0.004682,0.159331,0.149859,0.180489,G,0.021158
4,5,après en avoir délibéré conformément à la loi ...,"[0.3003010153770447, 0.29541015625, 0.00376743...",0.300301,0.29541,0.003767,0.187835,0.492992,0.520812,0.542429,G,0.021617
