In [1]:
import fastText
import pandas as pd
from sklearn.model_selection import train_test_split
import csv
import re
import numpy as np
from sklearn.metrics import confusion_matrix
import seaborn as sn
import pandas as pd
import matplotlib.pyplot as plt

In [13]:
# Load the review CSV into a DataFrame; the path is relative to the notebook's
# working directory.
df = pd.read_csv('data/yelp/yelp_review.csv')

In [15]:
# Keep only the rating and the review body. The explicit .copy() makes this an
# independent frame, so the column assignments in the following cells do not
# raise pandas' SettingWithCopyWarning.
df = df[['stars', 'text']].copy()

In [17]:
# Turn every star rating into a fastText-style label string, e.g. 5 -> '__label__5'.
# NOTE: re-running this cell prefixes the labels a second time.
df['stars'] = df['stars'].map('__label__{}'.format)

In [18]:
# Peek at the first rows to confirm the '__label__' prefix was applied.
df.head()

Unnamed: 0,stars,text
0,__label__5,Super simple place but amazing nonetheless. It...
1,__label__5,Small unassuming place that changes their menu...
2,__label__5,Lester's is located in a beautiful neighborhoo...
3,__label__4,Love coming here. Yes the place always needs t...
4,__label__4,Had their chocolate almond croissant and it wa...


In [19]:
# Single-pass translation table: spell out '&', '@' and every digit as a word
# padded with spaces. None of the replacement words contains a mapped character,
# so one translate() call equals applying the replacements one after another.
_SPELLED_OUT = str.maketrans({
    '&': ' and ',
    '@': ' at ',
    '0': ' zero ',
    '1': ' one ',
    '2': ' two ',
    '3': ' three ',
    '4': ' four ',
    '5': ' five ',
    '6': ' six ',
    '7': ' seven ',
    '8': ' eight ',
    '9': ' nine ',
})


def normalize(s):
    """
    Given a text, cleans and normalizes it. Feel free to add your own stuff.

    Steps, in order:
      1. lower-case the text;
      2. replace IPv4-looking tokens with `` _ip_ ``;
      3. pad the punctuation marks ``. ( ) ! ? - \\ / ,`` with spaces so each
         one becomes its own token;
      4. replace ``; : | • «`` and the control characters newline, tab and
         carriage return with a single space;
      5. spell out ``&``, ``@`` and the digits 0-9 as words.

    Args:
        s: The raw text as a ``str``.

    Returns:
        The normalized text. It contains no newline, tab or carriage return,
        so it can be written as one field of a tab-separated, unquoted,
        one-example-per-line file.
    """
    s = s.lower()
    # Replace ips (must run before punctuation is isolated, while the dots are
    # still adjacent to the digits)
    s = re.sub(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', ' _ip_ ', s)
    # Isolate punctuation
    s = re.sub(r'([.\(\)\!\?\-\\\/\,])', r' \1 ', s)
    # Remove some special characters. Tab and carriage return are included
    # because the text is later exported with sep="\t" and QUOTE_NONE, where an
    # embedded field/line separator cannot be escaped.
    s = re.sub(r'([\;\:\|•«\n\t\r])', ' ', s)
    # Replace numbers and symbols with language
    return s.translate(_SPELLED_OUT)

In [20]:
# Run every review through normalize(): lower-cases the text and puts spaces
# around punctuation so each mark becomes its own token.
df['text'] = df['text'].map(normalize)

### Split the whole dataset into training and validation sets, then save each split to a file. fastText is a Python wrapper around a C++ library that reads its training data from a file only, so you cannot pass a pandas DataFrame or other in-memory Python objects to it directly. The data must be saved as a plain text file. Acceptable separators: space '\s' and tab '\t'.

In [None]:
# Hold out 20% of the reviews for evaluation. The fixed random_state keeps the
# split (and every metric computed from it) reproducible across kernel restarts.
train, test = train_test_split(df, test_size=0.2, random_state=42)
train_file = 'data/yelp/nb_train.csv'
test_file = 'data/yelp/nb_test.csv'
# Write one example per line as "<label>\t<text>": no header, no index and no
# quoting, because the files are read back as plain text rather than as CSV.
for split, path in ((train, train_file), (test, test_file)):
    split.to_csv(path, sep="\t", quoting=csv.QUOTE_NONE, index=False, header=False)


# Model training

In [None]:
# Train the supervised classifier on the exported training file, using word
# bigrams (wordNgrams=2) and keeping every word (minCount=1).
nb_model = fastText.train_supervised(
    input=train_file,
    epoch=25,
    lr=1.0,
    wordNgrams=2,
    verbose=2,
    minCount=1,
)

In [None]:
# Echo the model object as the cell output to confirm training produced one.
nb_model

In [None]:
# Persist the trained model to disk. NOTE(review): assumes the 'model/yelp'
# directory already exists — confirm before a fresh run.
nb_model.save_model('model/yelp/nb_model.bin')

# Prediction

In [None]:
# Look up the model's vector for a single, already lower-case, word.
nb_model.get_word_vector('restaurant')

In [None]:
# The model was trained on normalize()d text, so the query goes through the
# same preprocessing; otherwise 'I' (upper case) would not match the
# lower-cased training vocabulary.
nb_model.get_sentence_vector(normalize('I love this restaurant'))

In [None]:
# Classify one sentence. It is normalized first so that it matches the
# lower-cased, punctuation-isolated text the model was trained on.
nb_model.predict(normalize('I love this restaurant'))

In [None]:
# Same query, asking for the three best labels (k=3). The sentence is
# normalized first so that it matches the text the model was trained on.
nb_model.predict(normalize('I love this restaurant'), k=3)

# Model Performance

# Precision and recall for single label

In [None]:
def print_results(N, p, r, k=1):
    """
    Print an evaluation summary: sample count, precision@k and recall@k.

    Args:
        N: Number of evaluated examples.
        p: Precision at ``k``.
        r: Recall at ``k``.
        k: The cutoff shown in the ``P@``/``R@`` labels. Defaults to 1, which
            reproduces the output of the original three-argument form.
    """
    print("N\t" + str(N))
    print("P@{}\t{:.3f}".format(k, p))
    print("R@{}\t{:.3f}".format(k, r))

In [None]:
# Evaluate on the held-out file with the default cutoff (top-1 prediction).
# The model trained in this notebook is `nb_model`; `su_model` is never defined
# here and would raise NameError on a fresh kernel.
print_results(*nb_model.test(test_file))

In [None]:
# Same evaluation, scoring the top-5 predictions per example. The cutoff is
# named once so the test() call and the printed labels cannot drift apart.
top_k = 5
n, p, r = nb_model.test(test_file, top_k)
print("N\t" + str(n))
print("P@{}\t{:.3f}".format(top_k, p))
print("R@{}\t{:.3f}".format(top_k, r))

In [None]:
def get_true_postive_count(cm, model, df, label_column_name, text_column_name, target_class):
    """
    Locate the confusion-matrix index that belongs to ``target_class``.

    Despite its name, this returns an *index*, not a count. It counts the true
    positives for ``target_class`` (rows labelled with the class that the model
    also predicts as that class) and looks that count up on the diagonal of
    ``cm``. Only the diagonal is searched, so an off-diagonal cell that happens
    to hold the same number cannot hijack the lookup.

    Args:
        cm: Square confusion matrix (array-like) computed from ``df``.
        model: Object whose ``predict(text)[0][0]`` is the predicted label.
        df: DataFrame holding the evaluated examples.
        label_column_name: Column of ``df`` with the true labels.
        text_column_name: Column of ``df`` with the texts to classify.
        target_class: The label whose index is wanted.

    Returns:
        The position ``i`` such that ``cm[i, i]`` equals the true-positive
        count of ``target_class``.

    Raises:
        ValueError: If no diagonal entry, or more than one, equals the
            true-positive count, i.e. the index cannot be determined uniquely.
    """
    class_texts = df[df[label_column_name] == target_class][text_column_name]
    tp_count = sum(model.predict(x)[0][0] == target_class for x in class_texts)
    # True positives live on the diagonal by definition.
    matches = np.flatnonzero(np.diag(np.asarray(cm)) == tp_count)
    if len(matches) == 1:
        return matches[0]
    raise ValueError(
        'Could not find a unique index for {!r}: true-positive count {} matches '
        'diagonal positions {}'.format(target_class, tp_count, matches.tolist())
    )
        
        
def resolve_labels_sequence(classes, cm, model, df, label_column_name, text_column_name):
    """
    Arrange ``classes`` in the order used by the rows/columns of ``cm``.

    Each label is placed at the index that ``get_true_postive_count`` reports
    for it. Slots that no label claims keep the placeholder value ``0``.

    Args:
        classes: Sequence of labels to arrange.
        cm: Confusion matrix the ordering refers to.
        model: Classifier forwarded to ``get_true_postive_count``.
        df: DataFrame forwarded to ``get_true_postive_count``.
        label_column_name: Name of the label column in ``df``.
        text_column_name: Name of the text column in ``df``.

    Returns:
        A list with ``len(classes)`` entries holding the labels by matrix index.
    """
    ordered = [0 for _ in range(len(classes))]
    for current_label in classes:
        slot = get_true_postive_count(
            cm, model, df, label_column_name, text_column_name, current_label
        )
        ordered[slot] = current_label
    return ordered


def fasttext_confusion_matrix(model, pd_test_data, label_column_name, text_column_name):
    """
    Print accuracy and the confusion matrix of ``model`` and plot it as a heatmap.

    The label order is fixed explicitly as the sorted union of the true and the
    predicted labels and passed to ``confusion_matrix``. The matrix, the printed
    label list and the heatmap axes therefore always agree, including when the
    model predicts a label that never occurs in the ground truth. Every row is
    predicted exactly once.

    Args:
        model: Object whose ``predict(text)[0][0]`` is the predicted label.
        pd_test_data: DataFrame with the evaluation examples.
        label_column_name: Column of ``pd_test_data`` with the true labels.
        text_column_name: Column of ``pd_test_data`` with the texts to classify.

    Returns:
        None. Results are printed and drawn on a new matplotlib figure.
    """
    test_labels = np.array(pd_test_data[label_column_name])
    pred_labels = np.array([model.predict(x)[0][0] for x in pd_test_data[text_column_name]])

    eq = test_labels == pred_labels
    print("Accuracy: " + str(eq.sum() / len(test_labels)))

    # .tolist() yields plain Python values, so the printed list stays readable.
    labels = sorted(set(test_labels.tolist()) | set(pred_labels.tolist()))
    cm = confusion_matrix(test_labels, pred_labels, labels=labels)
    print(labels)
    print(cm)

    # Rows are the true labels, columns the predicted ones.
    df_cm = pd.DataFrame(cm, index=labels, columns=labels)
    plt.figure(figsize=(10, 7))
    # fmt="d" prints the integer counts in full instead of scientific notation.
    ax = sn.heatmap(df_cm, annot=True, fmt="d")
    ax.set_xlabel("Predicted label")
    ax.set_ylabel("True label")

In [None]:
# `test` carries the DataFrame's 'stars' (label) and 'text' columns, and the
# model trained in this notebook is `nb_model`.
fasttext_confusion_matrix(nb_model, test, 'stars', 'text')