# Import libraries

In [1]:
import re
import pandas as pd
import numpy as np
from string import punctuation
from nltk.tokenize import word_tokenize
from nltk import FreqDist
from nltk.stem import SnowballStemmer,WordNetLemmatizer
from sklearn.utils import resample
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split, cross_val_score

stemmer=SnowballStemmer('english')
lemma=WordNetLemmatizer()
pd.set_option('display.max_colwidth', -1)

## Explore data set
- Review of Rotten Tomatoes movies
- Train set : 156060 Phrase , 8544 Sentence , 5 unique sentiment values (0,1,2,3,4) , No null value
> - 0 stands for very negative
> - 1 stands for somehow negative
> - 2 stands for neutral
> - 3 stands for somehow positive
> - 4 stands for very positive
- Test set (30%) : 66292 Phrase , 3310 Sentence , No null value
- SampleSubmission : All sentiment is 2
- What makes more challenging : negation , sacarsm , terseness , language ambiguity etc

In [2]:
sentiment = pd.read_csv('train.tsv',sep='\t')

In [3]:
sentiment.shape

(156060, 4)

In [4]:
sentiment['Sentiment'].unique()

array([1, 2, 3, 4, 0], dtype=int64)

## Cleaning data

In [5]:
def clean_review(review):
    review=re.sub('[^a-zA-Z]',' ',review)
    return review.lower()

In [6]:
sentiment['clean_review'] = sentiment['Phrase'].apply(clean_review)

## Resampling data

### Why do we need resample data ?
- This is because if one class has has much values compare to others , model can be overfitted (it trys to fit this class most)
- In fact here is situation of 5 classes :
> - 2_sentiment phrase number is 79582
> - 1_sentiment phrase number is 27273
> - 3_sentiment phrase number is 32927
> - 0_sentiment phrase number is 7072
> - 4_sentiment phrase number is 9206
- As we can see , 2 - labeled - sentiment take account of more than 50%. That means label that model predicts tend to be neutral
- For less - rows label , resample allows repetition to balance row numbers. Here we set 75000 

In [7]:
train_2 = sentiment[sentiment['Sentiment']==2]
train_1 = sentiment[sentiment['Sentiment']==1]
train_3 = sentiment[sentiment['Sentiment']==3]
train_4 = sentiment[sentiment['Sentiment']==4]
train_5 = sentiment[sentiment['Sentiment']==0]
train_2_sample = resample(train_2,replace=True,n_samples=75000,random_state=123)
train_1_sample = resample(train_1,replace=True,n_samples=75000,random_state=123)
train_3_sample = resample(train_3,replace=True,n_samples=75000,random_state=123)
train_4_sample = resample(train_4,replace=True,n_samples=75000,random_state=123)
train_5_sample = resample(train_5,replace=True,n_samples=75000,random_state=123)

df_upsampled = pd.concat([train_2, train_1_sample,train_3_sample,train_4_sample,train_5_sample])

In [8]:
df_upsampled.shape

(379582, 5)

## Clean test

In [9]:
test = pd.read_csv('test.tsv',sep='\t')
test['clean_review'] = test['Phrase'].apply(clean_review)

### TF-IDF Vetorization

In [10]:
vectorizer = TfidfVectorizer(ngram_range=(1, 2))

# Fit
vectorizer.fit(df_upsampled['clean_review'].values)

# Transform
df_upsampled_vectorized = vectorizer.transform(df_upsampled['clean_review'])
test_vectorized = vectorizer.transform(test['clean_review'])

In [11]:
y = df_upsampled['Sentiment']

## Training model

In [12]:
logreg = LogisticRegression()
ovr = OneVsRestClassifier(logreg)

In [13]:
%%time
ovr.fit(df_upsampled_vectorized, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#log

Wall time: 1min 6s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [14]:
scores = cross_val_score(ovr, df_upsampled_vectorized, y, scoring='accuracy', n_jobs=-1, cv=3)
print('Cross-validation mean accuracy {0:.2f}%, std {1:.2f}.'.format(np.mean(scores) * 100, np.std(scores) * 100))

Cross-validation mean accuracy 71.92%, std 0.37.


# Prediction

In [15]:
y_pred = ovr.predict(test_vectorized)

# Submit

In [16]:
sub = pd.read_csv('sampleSubmission.csv')

In [17]:
sub['Sentiment'] = y_pred

In [18]:
sub.to_csv('Dinh_Nguyen_submit.csv',index=False)