In [None]:
import json
import re
from html.parser import HTMLParser
from io import StringIO

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import spacy
from gensim.utils import deaccent, simple_preprocess

In [None]:
# Load the train / test / dev splits; each file is read as JSON Lines (lines=True).
train_data, test_data, valid_data = (
    pd.read_json(path, lines=True)
    for path in (
        '../input/dataset_fr_train.json',
        '../input/dataset_fr_test.json',
        '../input/dataset_fr_dev.json',
    )
)

# Preview the first rows of the training split.
train_data.head()

In [None]:
# Convert star ratings into sentiment classes:
# 1-2 stars -> 0 (negative), 3 stars -> 1 (presumably neutral), 4-5 stars -> 2 (positive).
sentiments_dict = {1: 0,
                   2: 0,
                   3: 1,
                   4: 2,
                   5: 2}


def stars_to_sentiment(dataset):
    """Return a new frame holding the review text and its sentiment class.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain a ``stars`` column and a ``review_body`` column.

    Returns
    -------
    pandas.DataFrame
        A fresh frame (same index as ``dataset``) with the columns
        ``review_body`` and ``sentiment``. Star values that are not keys of
        ``sentiments_dict`` map to NaN.

    Notes
    -----
    ``dataset`` itself is left untouched: the sentiment column is attached to
    a copy, so callers can safely add further columns to the result without
    mutating the raw data or triggering chained-assignment warnings.
    """
    sentiment = dataset['stars'].map(sentiments_dict)
    # ``assign`` works on a copy, so the returned frame owns its data.
    return dataset[['review_body']].assign(sentiment=sentiment)

In [None]:
# Attach sentiment labels to each split (train / test / dev), in that order.
df_train, df_test, df_valid = map(
    stars_to_sentiment,
    (train_data, test_data, valid_data),
)

In [None]:
class MLStripper(HTMLParser):
    """HTML parser that discards markup and accumulates only the text content."""

    def __init__(self):
        # convert_charrefs=True makes the parser decode character references
        # (e.g. ``&eacute;``) before passing text to handle_data().
        # The base constructor already calls reset().
        super().__init__(convert_charrefs=True)
        self.text = StringIO()

    def handle_data(self, d):
        """Collect a chunk of text found between tags."""
        self.text.write(d)

    def get_data(self):
        """Return all text collected so far as a single string."""
        return self.text.getvalue()


def html_free_text(html):
    """Return ``html`` with all tags removed and character references decoded.

    Parameters
    ----------
    html : str
        Raw text that may contain HTML markup.

    Returns
    -------
    str
        The concatenated text content of ``html``.
    """
    s = MLStripper()
    s.feed(html)
    # close() flushes buffered input. Without it, trailing text that contains
    # an unterminated '&' (e.g. "service R&D") is held back by the parser,
    # which waits for more data, and is silently lost.
    s.close()
    return s.get_data()

def url_free_text(text):
    """Strip URLs and @-prefixed tokens from ``text``.

    Each occurrence of ``@``, ``http://`` or ``https://`` is deleted together
    with the run of non-whitespace characters that follows it (at least one
    such character is required for a match). Surrounding whitespace is kept.
    """
    url_or_mention = re.compile(r'(?:\@|https?\://)\S+')
    return url_or_mention.sub('', text)
    
# Load the lookup table consumed by abrivot_free_text(): each key is a word
# to replace and its value is the replacement text.
with open('../input/abrivot_fr.json', encoding='utf-8') as f:
    abrivot = json.loads(f.read())
        
def abrivot_free_text(text):
    """Lower-case ``text`` and substitute every word that is a key of ``abrivot``.

    The text is split on whitespace; words found in the ``abrivot`` table are
    replaced by their mapped value, all others are kept. The words are then
    re-joined with single spaces.
    """
    replaced = []
    for word in text.lower().split():
        replaced.append(abrivot.get(word, word))
    return ' '.join(replaced)

def punct_free_text(text):
    """Tokenise ``text`` with gensim's ``simple_preprocess`` and re-join it.

    ``deacc=True`` asks gensim to strip accents and ``min_len=3`` drops
    tokens shorter than three characters; the surviving tokens are joined
    with single spaces.
    """
    return ' '.join(simple_preprocess(text, deacc=True, min_len=3))

In [None]:
def clean_data(data):
    """Run the text-cleaning chain on ``review_body`` and return a new frame.

    Steps, in order: cast to ``str``, strip HTML (``html_free_text``), strip
    URLs and @-tokens (``url_free_text``), substitute abbreviations
    (``abrivot_free_text``), then tokenise and re-join (``punct_free_text``).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``review_body`` and ``sentiment``.

    Returns
    -------
    pandas.DataFrame
        A fresh frame (same index as ``data``) with the columns
        ``text_review`` and ``sentiment``.

    Notes
    -----
    ``data`` is not modified. The intermediate results live in a chained
    Series instead of being written back as extra columns on the caller's
    frame, which avoids both the side effect and chained-assignment warnings.
    """
    text_review = (
        data['review_body']
        .apply(str)  # guards against non-string cells (e.g. NaN)
        .apply(html_free_text)
        .apply(url_free_text)
        .apply(abrivot_free_text)
        .apply(punct_free_text)
    )
    return pd.DataFrame({'text_review': text_review,
                         'sentiment': data['sentiment']})

In [None]:
%%time

df_train_clean = clean_data(df_train)
df_test_clean = clean_data(df_test)
df_valid_clean = clean_data(df_valid)

In [None]:
# Load the spaCy pipeline shipped in the installed `fr_core_news_lg` package,
# with the 'parser' and 'ner' components disabled.
# Alternative (presumably equivalent) way to load it by name:
#nlp = spacy.load('fr_core_news_lg', disable=['parser', 'ner'])

import fr_core_news_lg
nlp = fr_core_news_lg.load(disable=['parser', 'ner'])

In [None]:
# Extra stop words, one per line.
with open('../input/fr_stopwords.txt', encoding='utf-8') as f:
    fr_stopwords = f.read().splitlines()

# Merge the pipeline's built-in stop words with the custom list.
stop_words = nlp.Defaults.stop_words.union(fr_stopwords)

# Tokens are always accent-stripped (simple_preprocess(..., deacc=True)) before
# they are compared against this set, so an accented entry such as "très"
# could never match its token "tres". Add the accent-free form of every entry.
stop_words = stop_words.union(deaccent(word) for word in stop_words)

In [None]:
# Final preprocessing step: stop-word removal + lemmatization.
def process_words(texts, stop_words=stop_words):
    """Remove stop words from each document and lemmatize what remains.

    For every document the function:

    1. splits on whitespace and drops tokens found in ``stop_words``;
    2. runs the remaining text through the spaCy pipeline ``nlp`` and takes
       each token's lemma;
    3. re-tokenises the lemmas with ``simple_preprocess(deacc=True,
       min_len=3)`` and drops stop words again, since lemmatization can
       produce new stop words or short tokens.

    Parameters
    ----------
    texts : iterable of str
        Documents to process (e.g. a pandas Series of cleaned reviews).
    stop_words : collection of str, optional
        Words to discard; defaults to the notebook-level ``stop_words``.

    Returns
    -------
    list of str
        One space-joined string of lemmas per input document, in input order.
    """
    # First pass: drop stop words before handing the text to spaCy.
    filtered = (
        " ".join(word for word in doc.split() if word not in stop_words)
        for doc in texts
    )

    texts_out = []
    # nlp.pipe streams and batches the documents, which is faster than calling
    # nlp() once per document and yields the results in the same order.
    for doc in nlp.pipe(filtered):
        # Join the lemmas explicitly: tokenising str(list) would also tokenise
        # the list's repr (quotes, escape sequences) and could yield junk tokens.
        lemmas = " ".join(token.lemma_ for token in doc)
        kept = [
            word
            for word in simple_preprocess(lemmas, deacc=True, min_len=3)
            if word not in stop_words
        ]
        texts_out.append(" ".join(kept))

    return texts_out

In [None]:
def process_data(data):
    """Lemmatize ``text_review`` and return a new two-column frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``text_review`` and ``sentiment``.

    Returns
    -------
    pandas.DataFrame
        A fresh frame (same index as ``data``) with the columns
        ``review_processed`` (output of ``process_words``) and ``sentiment``.

    Notes
    -----
    ``data`` is not modified; the new column is attached to a copy so the
    caller's frame keeps its original columns.
    """
    review_processed = process_words(data['text_review'])
    # process_words returns one string per row, in row order, so the list is
    # assigned positionally.
    return (
        data[['sentiment']]
        .assign(review_processed=review_processed)
        .reindex(columns=['review_processed', 'sentiment'])
    )

In [None]:
%%time
df_train_proc = process_data(df_train_clean)
df_test_proc = process_data(df_test_clean)
df_valid_proc = process_data(df_valid_clean)

# Analyse des sentiments

In [None]:
# Save the processed splits for further use.
# The paths are relative, so the pickle files are written to the current
# working directory.

df_train_proc.to_pickle('df_train_proc.pkl')
df_test_proc.to_pickle('df_test_proc.pkl')
df_valid_proc.to_pickle('df_valid_proc.pkl')

### Experimenting

In [None]:
# First 8 processed training reviews of class 0 and of class 2.
negatives = df_train_proc.loc[df_train_proc['sentiment'].eq(0), 'review_processed'].head(8)
positives = df_train_proc.loc[df_train_proc['sentiment'].eq(2), 'review_processed'].head(8)

In [None]:
# Toy corpus: the negative samples first, then the positive ones.
corpus = [*negatives.tolist(), *positives.tolist()]
corpus

In [None]:
# Binary labels for the toy corpus: 8 zeros followed by 8 ones
# (the corpus lists the negative samples first, then the positive ones).
# `array` was never imported; use the NumPy alias the notebook already has.
sentiments = np.array([0] * 8 + [1] * 8)

## RNN

In [1]:
import fasttext.util

ModuleNotFoundError: No module named 'fasttext'

In [None]:
# Fetch the fastText model for language code 'fr'; if_exists='ignore' asks
# the helper to skip the download when the file is already present.
fasttext.util.download_model('fr', if_exists='ignore')

In [None]:
# Load the fastText binary model from the working directory
# (presumably the file fetched by download_model above; confirm the name).
ft = fasttext.load_model('cc.fr.300.bin')