# Allociné reviews ranking

In [None]:
!pip install dateparser
!pip install word2vec

In [None]:
# If you want to use Google Colab
import os
from google.colab import drive
drive.mount('/content/gdrive')
os.chdir("gdrive/My Drive/nlpENSAE/")

In [4]:
import numpy as np
import pandas as pd
import dateparser

In [5]:
films = pd.read_csv('film_data.csv')
reviews = pd.read_csv('film_reviews.csv')

In [6]:
films.actors = films.actors.str.translate({ord("'"): None})
reviews.comment = reviews.comment.str.lower()

In [7]:
films.date = films.date.apply(lambda x: dateparser.parse(x).date())
reviews.date = reviews.date.apply(lambda x: dateparser.parse(x).date())

In [8]:
reviews.dropna(inplace=True)

In [9]:
reviews = reviews.merge(films[['film_id', 'date']], on="film_id")

In [10]:
reviews['to_drop'] = abs(reviews.date_x - reviews.date_y).dt.days > 365/2

In [11]:
reviews = reviews[~reviews.to_drop]

In [12]:
def to_list(L):
  return L[1:-2].split(', ')

In [13]:
def replace_names(film_id, comment):
  actors = films[films.film_id == film_id]['actors'].iloc[0]
  author = films[films.film_id == film_id]['author'].iloc[0]
  for actor in to_list(actors):
    comment = comment.lower().replace(actor.lower(), '[actor]')
  comment = comment.lower().replace(author.lower(), '[author]')
  return comment

In [14]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop = stopwords.words('french')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
# add the stopword "les" in the list of stopwords
stop.append('les')
punctuations = ["#", "$", "%", "&", "'", "\\", "(", ")", "*", "+", ",", "-", ".", "/", ":", ";", "<", "=", ">", "?", "@", "\[", "\]", "^", "_", "`", "{", "\"", "|", "}", "]", "!"]
reviews['comment'] = reviews['comment'].apply(lambda row: "".join(x if x not in punctuations else " " for x in row))
reviews['comment'] = reviews['comment'].apply(lambda row: " ".join(x for x in row.split() if row not in stop))
reviews['comment'] = reviews.apply(lambda row: replace_names(row['film_id'], row['comment']), axis=1)

## Naive method

In [16]:
reviews['n_words'] = reviews.comment.apply(lambda x: len(x.split(' ')))

## Word2Vec

In [17]:
import gensim
import word2vec

In [18]:
model = gensim.models.Word2Vec.load("fr.bin")

In [19]:
# library for tokenization
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [20]:
# create a column for tokenized sentences
reviews['tokenized_sents'] = reviews.apply(lambda row: nltk.word_tokenize(row['comment']), axis=1)

In [21]:
# create two new columns
# first one for the vector of the sentences, i.e. the sum or mean of words'vector
# second one to count the number of words which are not in the vocab of the pre-trained Word2Vec
reviews["vectorised_sents"] = None
reviews["notInVocab"] = 0
for index, row in reviews.iterrows():
    sum_vector = np.zeros(300)
    notInVocab = 0
    for word in row["tokenized_sents"]:
        try:
            sum_vector += model[word]
        except:
            notInVocab += 1
    mean_vector = sum_vector / (len(row["tokenized_sents"]) - notInVocab)
    reviews.at[index, "vectorised_sents"] = mean_vector
    reviews.at[index, "notInVocab"] = notInVocab

  # This is added back by InteractiveShellApp.init_path()
  


In [22]:
# compute a dataframe to get one column for each coefficient of the vectorised sentences
X = pd.DataFrame(np.column_stack(list(zip(*reviews.vectorised_sents))), columns=range(300))

In [23]:
y = reviews[['rank', 'film_id', 'n_words']]

In [24]:
X.shape, y.shape

((9522, 300), (9522, 3))

In [25]:
y = y[(X.isna().sum(axis=1) == 0).values]

In [26]:
X = X[X.isna().sum(axis=1) == 0]

In [27]:
X.shape, y.shape

((9520, 300), (9520, 3))

## Train test split

In [28]:
# function to split specifically the data into train and test 
def film_train_test_split(X, y, min_test_size=.25, random_state=42):
  np.random.seed(random_state)
  y = y.copy(deep=True)
  films_train = np.unique(y.film_id)
  weight = 0
  films_test = []
  while weight < min_test_size:
    selected = np.random.randint(len(films_train))
    films_test = np.append(films_test, films_train[selected])
    films_train = np.delete(films_train,selected)
    weight = len(y[y.film_id.isin(films_test)])/len(y)
  return(X[y.film_id.isin(films_train).values], X[y.film_id.isin(films_test).values], y[y.film_id.isin(films_train).values], y[y.film_id.isin(films_test).values])  

In [29]:
X_train, X_test, y_train, y_test = film_train_test_split(X, y)

## Pairwise

In [30]:
import itertools

In [31]:
def build_pairwise(X, y):
    X_new = []
    y_new = []
    y = np.asarray(y)
    if y.ndim == 1:
        y = np.c_[y, np.ones(y.shape[0])]
    comb = itertools.combinations(range(X.shape[0]), 2)
    for k, (i, j) in  enumerate(comb):
        if y[i, 0] == y[j, 0] or y[i, 1] != y[j, 1]:
            # skip if same rank or different film
            continue
        X_new.append(np.concatenate([X[i], X[j]]))
        y_new.append(int(y[i, 0] < y[j, 0]))
    return np.asarray(X_new), np.asarray(y_new).ravel()

In [32]:
X_train_pair, y_train_pair = build_pairwise(X_train.values, y_train.values)

In [33]:
X_test_pair, y_test_pair = build_pairwise(X_test.values, y_test.values)

## RankNet

In [34]:
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Dense, Dropout, Subtract, Activation
from tensorflow.keras import backend
import tensorflow as tf

In [35]:
INPUT_DIM = 300
BATCH_SIZE = 4096

In [36]:
# Inputs
inputs_good = Input(shape=(INPUT_DIM, ))
inputs_bad = Input(shape=(INPUT_DIM, ))

# Model
h1 = Dense(2048, activation='elu')
dh1 = Dropout(0.25)
h2 = Dense(1024, activation='elu')
dh2 = Dropout(0.25)
h3 = Dense(512, activation='elu')
s = Dense(1)

# Computation scores
h1_good = h1(inputs_good)
dh1_good = dh1(h1_good)
h2_good = h2(dh1_good)
dh2_good = dh2(h2_good)
h3_good = h3(dh2_good)
s_good = s(h2_good)

h1_bad = h1(inputs_bad)
dh1_bad = dh1(h1_bad)
h2_bad = h2(dh1_bad)
dh2_bad = dh2(h2_bad)
h3_bad = h3(dh2_bad)
s_bad = s(h2_bad)

# Difference
diff = Subtract()([s_good, s_bad])

# Compute proba
proba = Activation('sigmoid')(diff)

In [37]:
ranknet = Model(inputs=[inputs_good, inputs_bad], outputs=proba)
get_score = backend.function([inputs_good], [s_good])
ranknet.compile(optimizer='adam', loss='binary_crossentropy')

In [38]:
#history = ranknet.fit([X_train_RankNet_1, X_train_RankNet_2], y_train_RankNet, validation_data=([X_test_RankNet_1, X_test_RankNet_2], y_test_RankNet), batch_size=BATCH_SIZE, epochs=3, verbose=1)
history = ranknet.fit([X_train_pair[:,0:int(X_train_pair.shape[1]/2)], \
                       X_train_pair[:,int(X_train_pair.shape[1]/2):X_train_pair.shape[0]]], y_train_pair, \
                      validation_data=([X_test_pair[:,0:int(X_test_pair.shape[1]/2)], \
                                        X_test_pair[:,int(X_test_pair.shape[1]/2):X_test_pair.shape[0]]],\
                                       y_test_pair),
                      batch_size=BATCH_SIZE, epochs=3, verbose=1)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [39]:
y_test['score'] = get_score([X_test.values])[0].ravel()

## Score

In [40]:
from sklearn.metrics import ndcg_score

In [42]:
n_words = []
score_ranknet = []

for i in y_test.film_id.unique():
  tmp = y_test[y_test.film_id == i]
  n_words.append(ndcg_score([tmp['rank']], [tmp['n_words']]))
  score_ranknet.append(ndcg_score([tmp['rank']], [tmp['score']]))

In [43]:
print(f'Naive estimator NDCG: {np.mean(n_words)} ({np.std(n_words)})')
print(f'RankNet NDCG: {np.mean(score_ranknet)} ({np.std(score_ranknet)})')

Naive estimator NDCG: 0.8021663280405044 (0.04740876632676116)
RankNet NDCG: 0.8345850851901646 (0.04858562306441741)
