# Sentiment Analysis using ML techniques
In this notebook we try to classify IMDB movie reviews as happy or not using machine learning. In more detail,
we clean the reviews, vectorize them using TF-IDF weights, and then classify them with a bagging ensemble of SVMs.
We evaluate the classifier using K-Fold Cross Validation, and then use it to predict the sentiment of
actual IMDB reviews. 
### The final classifier reaches an accuracy of up to 0.86200

In [None]:
import pickle
import time

import numpy as np
import pandas as pd

# Input data sets (CSV files).
train_path = "files/data/train.csv"
test_path = "files/data/test_without_labels.csv"
# Locations for pickled vectors. Only x_vectors_path is referenced by the
# cells of this notebook; test_vectors_path is defined for the test vectors.
x_vectors_path = "files/serialized/vectors"
test_vectors_path = "files/serialized/test_vectors"  # was misspelled "tets_vectors"
# Output CSV that receives the predicted labels.
predictions_path = "files/data/predictions.csv"


## Pickle Store and load
Used to store and load the computed vectors.

In [1]:
def pickle_store(obj, filename):
    """Serialize ``obj`` with pickle and write it to ``filename``.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten.
    """
    with open(filename, mode='wb') as sink:
        pickle.Pickler(sink).dump(obj)

def pickle_load(filename):
    """Read ``filename`` in binary mode and return the unpickled object.

    Unpickling can execute arbitrary code, so only load files that were
    produced by a trusted source (e.g. ``pickle_store`` in this notebook).
    """
    with open(filename, mode='rb') as source:
        restored = pickle.Unpickler(source).load()
    return restored

### Loading Data from Local store

In [12]:
# Labelled set: the 'Content' column is the input X, the 'Label' column the target y.
train = pd.read_csv(train_path)
X, y = train['Content'], train['Label']

# Unlabelled set whose 'Content' column is predicted at the end of the notebook.
test = pd.read_csv(test_path)
X_test = test['Content']

## Pre-process using Lemmatization

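Applying lemmatization using part-of-speech (POS) tags. The POS tags let us lemmatize not only nouns but also verbs, adjectives and adverbs.
Tokenization (gensim's `simple_preprocess`) also lowercases the text, strips accents and drops punctuation and non-alphabetic characters. Note that the code below does not remove stopwords.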

In [5]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import nltk
from gensim.utils import simple_preprocess

# Shared WordNet lemmatizer instance, used by documents_preprocess below.
lmtzr = WordNetLemmatizer()

def nltk2wn_tag(nltk_tag):
    """Map an NLTK POS tag to the matching WordNet POS constant.

    Tags starting with 'J', 'V', 'N' or 'R' map to ``wordnet.ADJ``,
    ``wordnet.VERB``, ``wordnet.NOUN`` and ``wordnet.ADV`` respectively.
    Any other tag returns ``None``.
    """
    # Checked in the same order as the tag prefixes are listed above; the
    # wordnet attribute is only looked up once a prefix actually matches.
    prefix_to_attr = (('J', 'ADJ'), ('V', 'VERB'), ('N', 'NOUN'), ('R', 'ADV'))
    for prefix, attr in prefix_to_attr:
        if nltk_tag.startswith(prefix):
            return getattr(wordnet, attr)
    return None

def documents_preprocess(documents):
    """Tokenize and lemmatize every document.

    Each document is tokenized with gensim's ``simple_preprocess``
    (``deacc=True``), POS-tagged with ``nltk.pos_tag`` and every token whose
    tag maps to a WordNet POS is lemmatized; other tokens are kept as they
    are. The elapsed time is printed.

    Returns a list with one list of tokens per input document.
    """
    def _normalize(token, pos):
        # Lemmatize only when the NLTK tag has a WordNet counterpart.
        wn_pos = nltk2wn_tag(pos)
        return token if wn_pos is None else lmtzr.lemmatize(token, wn_pos)

    began_at = time.time()
    processed = [
        [_normalize(token, pos)
         for token, pos in nltk.pos_tag(simple_preprocess(doc, deacc=True))]
        for doc in documents
    ]

    print("Text Preprocessing took: " + str(time.time() - began_at))
    return processed

## Text pre-process and Vectorization
We use the hashing trick and then a TF-IDF transformer in order to convert word 
frequencies into TF-IDF values. 

In [6]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer 

# The documents passed in by this notebook are already lists of tokens
# (output of documents_preprocess), so the tokenizer is the identity function
# and lower-casing is switched off. Tokens are hashed into 100000 features.
vectorizer = HashingVectorizer(n_features=100000, lowercase=False, tokenizer=lambda x: x)

def tfidf_vectorization(documents, transformer=None, fit=True):
    """Vectorize documents with the hashing vectorizer and apply TF-IDF weighting.

    Parameters
    ----------
    documents : iterable
        Documents in the form accepted by the module-level ``vectorizer``.
    transformer : TfidfTransformer, optional
        Transformer to apply. When omitted, a fresh ``TfidfTransformer`` is
        created and fitted on ``documents`` (the original behaviour). Note
        that in this default mode every call learns its own IDF weights, so
        a training set and a test set vectorized by two separate calls are
        weighted differently. To weight both with the IDF learned from the
        training data, create one transformer, pass it with ``fit=True`` for
        the training set and with ``fit=False`` for the test set.
    fit : bool, default True
        Whether to fit the transformer on ``documents`` before transforming.

    Returns
    -------
    The TF-IDF weighted matrix produced by the transformer.

    Raises
    ------
    ValueError
        If ``fit`` is False but no (already fitted) ``transformer`` is given.
    """
    if transformer is None:
        if not fit:
            raise ValueError("fit=False requires an already fitted transformer")
        transformer = TfidfTransformer()

    starting_tm = time.time()
    hashed = vectorizer.fit_transform(documents)
    if fit:
        vectors = transformer.fit_transform(hashed)
    else:
        # Re-use the IDF weights learned earlier instead of re-estimating them.
        vectors = transformer.transform(hashed)
    print("Vectorization time: " + str((time.time() - starting_tm)))
    return vectors


## Evaluation using 5-Fold Cross Validation

In [7]:
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics

def evaluation(clf, clf_name, X, y, k=5, average='micro'):
    """Evaluate ``clf`` with stratified k-fold cross validation and print the means.

    For every fold the classifier is fitted on the training part and scored
    on the held-out part. Precision, recall, F1 and accuracy are averaged
    over the ``k`` folds and printed together with the execution time.

    Parameters
    ----------
    clf : estimator
        Object with ``fit(X, y)`` and ``predict(X)``. It is re-fitted in
        place on every fold.
    clf_name : str
        Label printed in front of the scores.
    X : matrix
        Feature matrix; must support row selection with an integer index
        array (``X[indices]``).
    y : array-like
        Target labels, one per row of ``X``.
    k : int, default 5
        Number of folds.
    average : str, default 'micro'
        Averaging mode passed to the precision, recall and F1 scorers. With
        the default 'micro', these three scores are all equal to the
        accuracy for single-label classification; pass e.g. 'macro' to get
        per-class averages instead.

    Returns
    -------
    None. The scores are only printed.
    """
    starting_tm = time.time()
    # The fold indices are positions. Indexing a pandas Series with [] is
    # label-based and breaks on a non-default index, so work on a plain array.
    y = np.asarray(y)

    clf_precision = 0
    clf_recall = 0
    clf_f1 = 0
    clf_accuracy = 0

    skf = StratifiedKFold(n_splits=k)
    for train_index, test_index in skf.split(X, y):

        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)

        clf_precision += metrics.precision_score(y_test, predictions, average=average)
        clf_recall += metrics.recall_score(y_test, predictions, average=average)
        clf_f1 += metrics.f1_score(y_test, predictions, average=average)
        clf_accuracy += metrics.accuracy_score(y_test, predictions)

    # compute the average of each value
    precision_score = clf_precision/k
    recall_score = clf_recall/k
    f1_score = clf_f1/k
    accuracy_score = clf_accuracy/k

    print(clf_name + "\nPrecision: " + str(precision_score)
          + "\nRecall: " + str(recall_score)
          + "\nF1-Measure: " + str(f1_score) 
          + "\nAccuracy: " + str(accuracy_score)
          + "\nExecution time: " + str(time.time() - starting_tm))

## Pre-process and vectorization

In [None]:
# Always start from the raw review text: feeding X back into itself made this
# cell fail on a second run, because X then already held token lists.
X = documents_preprocess(train['Content'])
X_vectors = tfidf_vectorization(X)

# Cache the training vectors on disk.
pickle_store(X_vectors, x_vectors_path)

In [17]:
# Reload the training vectors from the pickle cache (presumably so the
# pre-processing cell does not have to be re-run in a new session).
X_vectors = pickle_load(x_vectors_path)

In [11]:
from sklearn.ensemble import BaggingClassifier
from sklearn import svm

n = 8  # number of SVMs in the ensemble, also used as the number of parallel jobs
seed = 42  # fixed seed so the random sub-sampling done by bagging is repeatable
svm_clf = svm.SVC(gamma=0.1, C=10, kernel='rbf')
# max_samples=1/n: each SVM is trained on a random draw whose size is 1/n of
# the training set. Without random_state every run gave different results.
bagging_clf = BaggingClassifier(svm_clf, n_estimators=n, max_samples=1/n,
                                n_jobs=n, random_state=seed)

## Evaluation

In [None]:
# Cross-validate the bagged SVM on the training vectors; the scores are printed.
evaluation(bagging_clf, "Baggings SVM",  X_vectors, y)

## Testing set Pre-process and Vectorization

In [None]:
# Always start from the raw review text: feeding X_test back into itself made
# this cell fail on a second run, because X_test then already held token lists.
X_test = documents_preprocess(test['Content'])
# NOTE: called like this, tfidf_vectorization fits a fresh TfidfTransformer on
# the documents it receives, so the IDF weights used here are learned from the
# test set and differ from the ones used for the training vectors.
X_test_vectors = tfidf_vectorization(X_test)

## Predicting and storing the results as CSV

In [None]:
# Train on the complete training set, then label the test vectors.
# fit() returns the fitted estimator, so the two steps can be chained.
predictions = bagging_clf.fit(X_vectors, y).predict(X_test_vectors)

In [16]:
# One row per test review: its Id and the predicted label.
submission_columns = {
    'Id': list(test['Id']),
    'Predicted': predictions,
}
predictions_df = pd.DataFrame(data=submission_columns)
predictions_df.to_csv(predictions_path, index=False)
