In [42]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

## Additional TFIDF ANALYSIS
Additional analysis on the data. I tried KMeans clustering on the TF-IDF vectors to see what structure it would surface. Long story short, the clusters seem to track the movie's genre and location rather than its sentiment.

Also, collapsing the sentiment labels to just Positive, Negative and Neutral increases the accuracy of the baseline model (TF-IDF + Naive Bayes) from 60% to 70%.

TF-IDF + LinearSVC reaches 73.5% accuracy.

In [2]:
# Read the tab-separated training file and preview its first rows.
TRAIN_PATH = '/kaggle/input/privatetrain/train.tsv'
df = pd.read_csv(TRAIN_PATH, sep='\t')
df.head()

In [3]:
# Bar chart of the number of rows per sentiment label.
sentiment_counts = df.groupby("Sentiment")["Sentiment"].count()
sentiment_counts.plot.bar(ylim=0)

In [4]:
# Strip digits and punctuation from every phrase before vectorising.
# `regex=True` is required: since pandas 2.0 `Series.str.replace` treats the
# pattern as a literal string by default, which would turn both calls into no-ops.
# Raw strings keep `\d`, `\w` and `\s` from being parsed as invalid string escapes.
df['Phrase'] = df['Phrase'].str.replace(r'\d+', '', regex=True)  # remove digits
df['Phrase'] = df['Phrase'].str.replace(r'[^\w\s]', '', regex=True)  # remove punctuation

In [5]:
# Extend scikit-learn's English stop-word list with a few movie-domain words.
# Stored as a sorted list: recent scikit-learn releases validate `stop_words`
# and accept only 'english', a list or None, so the frozenset returned by
# `.union()` would be rejected by TfidfVectorizer.
my_stop_words = sorted(ENGLISH_STOP_WORDS.union(['film', 'movie', 'cinema', 'theatre', 'hollywood']))

In [6]:
# Fit a unigram TF-IDF model on every phrase; `vectors` is the resulting
# sparse document-term matrix.
vectorizer = TfidfVectorizer(stop_words=my_stop_words, ngram_range=(1, 1))
vectors = vectorizer.fit_transform(df.Phrase)
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement. Wrapped in list() so that
# `feature_names` stays a plain list as before.
feature_names = list(vectorizer.get_feature_names_out())

In [7]:
# Pre-size `terms` with one empty-string slot per feature name.
terms = ['' for _ in range(len(feature_names))]

In [8]:
# Sanity check: number of slots held by `terms`.
len(terms)

In [None]:
# Dimensions of the TF-IDF matrix produced by `fit_transform`.
print(vectors.shape)

In [None]:
# Copy the fitted vocabulary into `terms` so that terms[i] is the i-th feature name.
# `get_feature_names()` was removed in scikit-learn 1.2, so the `_out` variant is
# used, and the list is built in one step instead of being filled index by index
# (which also required `terms` to be pre-sized by an earlier cell).
terms = list(vectorizer.get_feature_names_out())

In [None]:
# Spot-check: the term stored at index 3.
print(terms[3])

In [None]:
# Spot-check the terms stored at a handful of hard-coded indices.
for term_index in (5904, 5880, 157, 12296):
    print(terms[term_index])



In [None]:
# Show the TF-IDF entries of row 4 of `vectors`.
print(vectors[4])


In [None]:
# Show the phrase and sentiment label stored in row 4 of the data frame.
row_4 = df.iloc[4]
print(row_4.Phrase)
print(row_4.Sentiment)


In [None]:
# Empty scratch table with a "TermName" column and one column per label "0".."4"
# (presumably meant to hold a per-term sentiment breakdown -- TODO confirm).
# It is stored under its own name: assigning it to `df` would throw away the
# loaded reviews that later cells still read (`df['Phrase']`, `df['Sentiment']`).
column_names = ["TermName", "0", "1", "2", "3", "4"]
term_sentiment_counts = pd.DataFrame(columns=column_names)
# TODO: populate the table. Unfinished draft, kept for reference only
# (it is not valid Python as written -- note the `=` in the condition):
#   for txt, tokenid in vectors:
#       sent = df.iloc[txt].Sentiment
#       if tokenid = "0":

   
    
    

In [None]:
def top_tfidf_feats(row, features, top_n=25):
    '''Return the top_n largest TF-IDF values in row with their feature names.

    Parameters
    ----------
    row : 1-D array-like of TF-IDF weights, or a single-row sparse matrix
        (any object exposing ``toarray()``).
    features : sequence of feature names; ``features[i]`` labels ``row[i]``.
    top_n : maximum number of entries to return.

    Returns
    -------
    pandas.DataFrame with columns ``feature`` and ``tfidf``, ordered by
    descending weight. It is empty, but still has both columns, when row
    is empty.
    '''
    # A sparse row cannot be argsorted directly; densify it first.
    if hasattr(row, 'toarray'):
        row = row.toarray()
    weights = np.asarray(row).ravel()
    topn_ids = np.argsort(weights)[::-1][:top_n]
    top_feats = [(features[i], weights[i]) for i in topn_ids]
    # Passing `columns` at construction keeps the two-column layout even with
    # zero rows, where assigning `df.columns` afterwards would raise.
    return pd.DataFrame(top_feats, columns=['feature', 'tfidf'])

In [None]:
# Show the strongest TF-IDF terms of row 4, the row inspected in the cells above.
# The function has two required arguments, so a bare call raises TypeError.
# The sparse row is flattened to a dense 1-D array so that it can be argsorted.
top_tfidf_feats(vectors[4].toarray().ravel(), feature_names)

## KMeans Model

In [None]:
# Number of KMeans clusters; also bounds the per-cluster reporting loop further down.
clusters = 3

In [None]:
# One k-means++ initialisation (n_init=1), capped at 100 iterations.
# random_state pins the initial centroids so that the clusters -- and the top
# terms printed from them -- are the same on every run; without it each
# re-run can produce a different clustering.
model = KMeans(n_clusters=clusters, init='k-means++', max_iter=100, n_init=1, random_state=42)

In [None]:
# Cluster the TF-IDF document vectors.
model.fit(vectors)

In [None]:
# For each cluster centre, feature indices ordered from largest to smallest weight.
# NOTE(review): `centroids` is not read anywhere in the visible notebook; the
# reporting cell recomputes the same expression as `order_centroids`.
centroids = model.cluster_centers_.argsort()[:, ::-1]

In [None]:
# List the ten highest-weighted terms of every cluster centre, one line per cluster.
print("Top terms per cluster:")
# Feature indices of each centre, ordered from largest to smallest weight.
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
# `get_feature_names()` was removed in scikit-learn 1.2; use the `_out` variant.
terms = vectorizer.get_feature_names_out()
for i in range(clusters):
    # The original used Python 2 print idioms (trailing comma, bare `print`),
    # which in Python 3 print every term on its own line and never emit the
    # separating newline; build the whole line explicitly instead.
    top_terms = ' '.join(terms[ind] for ind in order_centroids[i, :10])
    print("Cluster %d: %s" % (i, top_terms))

print("\n")
print("Prediction")


In [None]:
# Assign two hand-written example phrases to a cluster and print the result for each.
for phrase in ("started bad got worse", "great performance by a great director"):
    Y = vectorizer.transform([phrase])
    prediction = model.predict(Y)
    print(prediction)

## Naive Bayes

In [9]:
# Collapse the label set: 0 is merged into 1 and 4 is merged into 3.
df['Sentiment'] = df['Sentiment'].replace({0: 1, 4: 3})

In [10]:
# Bar chart of the number of rows per sentiment label after the labels were merged.
collapsed_counts = df.groupby("Sentiment")["Sentiment"].count()
collapsed_counts.plot.bar(ylim=0)

In [11]:
# Phrases are the inputs and sentiment labels the targets;
# 33% of the rows are held out for evaluation, with a fixed seed.
X, y = df['Phrase'], df['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [13]:
# Baseline pipeline: TF-IDF over 1- to 3-grams followed by multinomial Naive Bayes.
nb_steps = [
    ('vect', TfidfVectorizer(stop_words=my_stop_words, ngram_range=(1, 3))),
    ('clf', MultinomialNB()),
]
text_clf_nb_tf = Pipeline(nb_steps)

In [14]:
# Fit the Naive Bayes pipeline on the training split.
text_clf_nb_tf.fit(X_train, y_train)

In [15]:
# Class probabilities predicted by the Naive Bayes pipeline for one hand-written phrase.
data = ["ultimately fell flat"]

text_clf_nb_tf.predict_proba(data)

In [16]:
# Per-class probabilities for every phrase in the held-out test split.
predictions_nb_tf_proba = text_clf_nb_tf.predict_proba(X_test)

In [17]:
# Number of probability rows returned for the test split.
len(predictions_nb_tf_proba)

In [18]:
# Keep one copy of a probability row for every class probability in it that
# exceeds 0.8, then count how many such entries were found.
a = [row for row in predictions_nb_tf_proba for p in row if p > 0.8]
num_grt_80 = len(a)

In [19]:
# Number of class probabilities above the 0.8 threshold counted in the previous cell.
print(num_grt_80)
#print(a)
# Recorded results:
# 1879 rows out of 51500 with a probability > 0.90
# 7863 rows out of 51500 with a probability > 0.80

In [20]:
# Predicted labels of the Naive Bayes pipeline for the held-out test split.
predictions_nb_tf = text_clf_nb_tf.predict(X_test)

In [21]:
# Confusion matrix of the Naive Bayes predictions against the true test labels.
nb_confusion = metrics.confusion_matrix(y_test, predictions_nb_tf)
print(nb_confusion)

In [22]:
# Per-class classification report for the Naive Bayes predictions.
nb_report = metrics.classification_report(y_test, predictions_nb_tf)
print(nb_report)

In [23]:
# Overall accuracy of the Naive Bayes predictions on the test split.
nb_accuracy = metrics.accuracy_score(y_test, predictions_nb_tf)
print(nb_accuracy)

## LinearSVC

In [43]:
# Pipeline: TF-IDF over 1- to 3-grams followed by a linear SVM (up to 10000 iterations).
# A class_weight='balanced' variant of the classifier was also tried.
svc_steps = [
    ('vect', TfidfVectorizer(stop_words=my_stop_words, ngram_range=(1, 3))),
    ('clf', LinearSVC(max_iter=10000)),
]
text_clf_svc_tf = Pipeline(svc_steps)

In [47]:
# Hyper-parameter grid for GridSearchCV: 3 x 2 x 2 = 12 candidate settings.
# Keys use the "<pipeline step>__<parameter>" naming.
param_grid = dict(
    vect__max_df=[0.8, 0.9, 1.0],
    clf__C=[0.1, 1.0],
    clf__class_weight=['balanced', None],
)

In [48]:
# do 3-fold cross validation for each of the 12 possible
# combinations of the parameter values above (3 max_df x 2 C x 2 class_weight)
grid = GridSearchCV(text_clf_svc_tf, cv=3, param_grid=param_grid)
grid.fit(X_train,y_train)

In [49]:
# Report the best configuration, then the mean (std) test score of every candidate.
print("Best: {:f} using {}".format(grid.best_score_, grid.best_params_))
cv_results = grid.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
params = cv_results['params']
for candidate in range(len(params)):
    print("{:f} ({:f}) with: {!r}".format(means[candidate], stds[candidate], params[candidate]))

In [50]:
# Rebuild the SVM pipeline with explicit settings: max_df=0.8, C=1.0, class_weight=None.
tuned_svc_steps = [
    ('vect', TfidfVectorizer(stop_words=my_stop_words, max_df=0.8, ngram_range=(1, 3))),
    ('clf', LinearSVC(max_iter=10000, C=1.0, class_weight=None)),
]
text_clf_svc_tf = Pipeline(tuned_svc_steps)

In [51]:
# Fit the linear SVM pipeline on the training split.
text_clf_svc_tf.fit(X_train, y_train)

In [52]:
# Predicted labels of the linear SVM pipeline for the held-out test split.
predictions_svc_tf = text_clf_svc_tf.predict(X_test)

In [53]:
# Confusion matrix of the linear SVM predictions against the true test labels.
svc_confusion = metrics.confusion_matrix(y_test, predictions_svc_tf)
print(svc_confusion)

In [54]:
# Per-class classification report for the linear SVM predictions.
svc_report = metrics.classification_report(y_test, predictions_svc_tf)
print(svc_report)

In [55]:
# Print the overall accuracy of the linear SVM predictions on the test split
print(metrics.accuracy_score(y_test,predictions_svc_tf))
# Recorded accuracy: 0.735 without class_weight='balanced'
# and 0.731 with class_weight='balanced'