# Lab 2: Sentiment Classification with SVM

In [None]:
import numpy as np
from numpy.random import choice
import pandas as pd
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix, f1_score
import nltk

# Fetch the NLTK resources needed by the tokenizer, lemmatizer, stop-word
# filter and POS tagger used further down.
for resource in ('punkt', 'wordnet', 'stopwords', 'averaged_perceptron_tagger'):
    nltk.download(resource)

from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
import re, string
from tqdm.notebook import tqdm

## Importing data

In [None]:
# Load the training split and discard the "Unnamed: 0" column
# (presumably a saved index -- confirm against the file).
df_train = pd.read_csv("data/lab_train.txt", engine="python").drop(columns="Unnamed: 0")
df_train.head() # 173 positive evaluations

In [None]:
# Load the test split and discard the "Unnamed: 0" column
# (presumably a saved index -- confirm against the file).
df_test = pd.read_csv("data/lab_test.txt", engine="python").drop(columns="Unnamed: 0")
df_test.head() # 88 positive evaluations

In [None]:
# Read the evaluation spreadsheet without a header row, naming the column `review`.
df_eval = pd.read_excel('data/evaluation_dataset.xlsx', header=None, names=['review'])
# Preview the first rows.
df_eval.head()

In [None]:
# Raw review texts of the train / test splits, taken from the `review` column.
train_reviews = df_train.review.values
test_reviews = df_test.review.values

## Cleaning Data
### Using the Natural Language Toolkit (NLTK) package

In [None]:
# NLTK's English stop-word list; used as the default filter of clean_review.
stop_words = stopwords.words('english')

def lemmatize_sentence(tokens):
    """Lemmatize a list of tokens according to their part-of-speech tags.

    Every token is tagged with ``pos_tag``. Tags starting with ``NN`` are
    lemmatized as nouns, tags starting with ``VB`` as verbs, and all other
    tags as adjectives.

    Returns the lemmas as a list, in the same order as ``tokens``.
    """
    wordnet = WordNetLemmatizer()

    def wordnet_pos(tag):
        # Map the tagger's tag onto the POS code passed to the lemmatizer.
        for prefix, code in (('NN', 'n'), ('VB', 'v')):
            if tag.startswith(prefix):
                return code
        return 'a'

    return [wordnet.lemmatize(word, wordnet_pos(tag))
            for word, tag in pos_tag(tokens)]

# Patterns stripped from every token before lemmatization (compiled once).
_URL_PATTERN = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
    r'(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)
_MENTION_PATTERN = re.compile(r"(@[A-Za-z0-9_]+)")


def clean_review(tokens, stop_words=stop_words, numbers=True):
    """Normalize a tokenized review and return it as one cleaned string.

    For every token: links and ``@mentions`` are removed, the token is
    lemmatized according to its POS tag (noun / verb / adjective fallback),
    and it is kept, lower-cased, only if it is non-empty, is not
    punctuation and is not a stop word.

    Args:
        tokens: List of word tokens of a single review.
        stop_words: Iterable of lower-case words to discard.
        numbers: When true, tokens containing any ASCII digit are dropped.

    Returns:
        The surviving tokens joined with single spaces. A string is
        returned for both values of ``numbers``.
    """
    lemmatizer = WordNetLemmatizer()
    # Set lookup instead of scanning the stop-word list for every token.
    stop_set = frozenset(stop_words)
    cleaned_tokens = []

    for token, tag in pos_tag(tokens):
        token = _URL_PATTERN.sub('', token)
        token = _MENTION_PATTERN.sub('', token)

        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        token = lemmatizer.lemmatize(token, pos)

        lowered = token.lower()
        # `in string.punctuation` is a substring test, so it also drops
        # multi-character runs such as "()" that occur in that constant.
        if token and token not in string.punctuation and lowered not in stop_set:
            cleaned_tokens.append(lowered)

    if numbers:
        # Drop every token that contains at least one ASCII digit.
        digits = frozenset(string.digits)
        cleaned_tokens = [w for w in cleaned_tokens if digits.isdisjoint(w)]

    # Merge outside the `if` so that numbers=False also returns a string.
    return ' '.join(cleaned_tokens)

def clean(array):
    """Tokenize and clean every review in ``array``.

    Each phrase is split with ``word_tokenize`` and passed through
    ``clean_review``.

    Args:
        array: Iterable of raw review strings.

    Returns:
        A new 1-D object array holding the cleaned strings, in input
        order. The input is left untouched: writing into it would also
        rewrite the DataFrame column when the caller passes
        ``df.column.values``, and fails on read-only arrays.
    """
    return np.array(
        [clean_review(word_tokenize(phrase)) for phrase in array],
        dtype=object,
    )

In [None]:
# Clean the train reviews first, then the test reviews.
train_reviews, test_reviews = map(clean, (train_reviews, test_reviews))

## Feature Extraction 
### Using the scikit-learn library

In [None]:
# TF-IDF features: terms must appear in at least 3 documents, English stop
# words are removed and accents are stripped to ASCII.
vectorizer = TfidfVectorizer(
    strip_accents='ascii',
    stop_words=stopwords.words('english'),
    min_df=3,
)

# The vocabulary is fitted on the training reviews only; both splits are
# converted to dense arrays.
train_matrix = vectorizer.fit_transform(train_reviews)
X_train = train_matrix.toarray()
X_test = vectorizer.transform(test_reviews).toarray()

X_train.shape

In [None]:
def targets(score, thresh=2.5):
    """Convert numeric scores into binary sentiment labels.

    Args:
        score: Array-like of numeric scores.
        thresh: Scores strictly below this value are labelled negative.

    Returns:
        Integer array of the same shape as ``score`` holding ``-1`` where
        ``score < thresh`` and ``1`` elsewhere.
    """
    # The builtin `int` replaces the `np.int` alias, which was removed in
    # NumPy 1.24.
    score = np.asarray(score)
    labels = np.ones(score.shape, dtype=int)
    labels[score < thresh] = -1
    return labels

In [None]:
# Turn the score column of each split into -1 / +1 labels.
Y_train, Y_test = (targets(frame.score.values) for frame in (df_train, df_test))

## Training the model
### Since the model predicted only positive labels when trained on all the data, we limit the influence of the positively labeled data by training on a subset that contains as many positively labeled reviews as negatively labeled ones.
### This is why there is a for loop: we train models on different subsets and keep only the best one, i.e. the one that yields the highest F1-score. This score takes into account the correct classification of both the 'negative' and 'positive' classes, not only the accuracy. This is why we get at best 84% accuracy (lower than the 88% we would get by predicting only positive labels), but this model correctly predicts both 'negative' and 'positive' labels (see the recall scores below).

In [None]:
# Baseline: an RBF-kernel SVM fitted on the full training set.
# `fit` returns the estimator itself, so construction and fitting are chained.
classifier = svm.SVC(kernel='rbf').fit(X_train, Y_train)
classifier.predict(X_test)

In [None]:
# Trying to reduce influence of positive classification
neg = Y_train == -1
pos = Y_train == 1

classifiers=[]
scores = []
accuracy = []
for k in tqdm(range(5000)):
    idxs = choice(173, 27, replace=False)
    
    x_array = [X_train[neg]]+[X_train[idxs]]
    y_array = [Y_train[neg]]+[Y_train[idxs]]
    
    new_x_train = np.concatenate(x_array)
    new_y_train = np.concatenate(y_array)
    
    classifier = svm.SVC(kernel='rbf')
    classifier.fit(new_x_train, new_y_train)
    Y_pred = classifier.predict(X_test)
    
#     X_eval = vectorizer.transform(df_eval.review.values)
#     score = classifier.predict(X_eval)
    
    classifiers.append(classifier)
    scores.append(f1_score(Y_test,Y_pred))
    accuracy.append(np.mean(Y_pred==Y_test))
    
print('F1-score of the best model', scores[scores.index(max(scores))])
print('Accuracy score of the best model', accuracy[scores.index(max(scores))])

In [None]:
# Taking the best model
# Keep the classifier with the highest F1-score (first one in case of ties)
# and report its per-class metrics on the test split.
best_index = scores.index(max(scores))
classifier = classifiers[best_index]
Y_pred = classifier.predict(X_test)
print(classification_report(Y_test, Y_pred))

### Evaluation

In [None]:
# Clean and vectorize the evaluation reviews, then predict their labels.
eval_reviews = clean(df_eval.review.values)
X_eval = vectorizer.transform(eval_reviews).toarray()
pred_eval = classifier.predict(X_eval)
print(f"{np.sum([pred_eval==1])} positive ratings")

In [None]:
# Translate the numeric predictions into label strings; entries that are
# neither 1 nor -1 keep the initial value 0.
predictions = np.zeros(pred_eval.shape, dtype=object)
for label_value, label_name in ((1, 'positive'), (-1, 'negative')):
    predictions[pred_eval == label_value] = label_name

In [None]:
# Read the evaluation file again and add the predicted labels as the
# second column.
df_eval = pd.read_excel('data/evaluation_dataset.xlsx', header=None, names=['review'])
df_eval.insert(loc=1, column="prediction", value=predictions, allow_duplicates=True)
df_eval.head()

In [None]:
# Save the evaluation frame to CSV without writing the row index.
df_eval.to_csv('data/evaluation_cleaning.csv', index=False)

In [None]:
# Check the selected classifier on the first rows of a second labelled file.
df_test = pd.read_csv('data/test.csv', engine='python')
n_checked = 100

# `X` holds the true labels (name kept from the original cell). The builtin
# `int` replaces the `np.int` alias, which was removed in NumPy 1.24.
X = df_test.score.values[:n_checked].astype(int)
# Map label 0 to -1 so the labels match the classifier's -1 / +1 targets.
X[X==0]=-1

# Slice before cleaning and vectorizing so only the evaluated rows are processed.
reviews = clean(df_test.review.values[:n_checked])
pred = classifier.predict(vectorizer.transform(reviews).toarray())
print(classification_report(X, pred))