In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

## Additional TFIDF ANALYSIS
Additional analysis of the data. I tried KMeans clustering to see what it would surface. Long story short, the clusters seem to track the genre and the location of the movie rather than its sentiment.

Also, collapsing the sentiment labels to just Positive, Negative, and Neutral increases the accuracy of the baseline model (TF-IDF + Naive Bayes) from 60% to 70%.

In [None]:
# Read the tab-separated training phrases and preview the first rows.
train_path = '/kaggle/input/privatetrain/train.tsv'
df = pd.read_csv(train_path, delimiter='\t')
df.head()

In [None]:
# Bar chart of how many phrases carry each sentiment label.
sentiment_counts = df.groupby("Sentiment")["Sentiment"].count()
sentiment_counts.plot(kind="bar", ylim=0)

In [None]:
# Strip digits and punctuation from the phrases before vectorizing.
# regex=True is explicit because pandas >= 2.0 defaults Series.str.replace to
# literal matching, which would silently leave the text untouched; raw strings
# avoid the invalid-escape warning for '\d' and '\w'.
df['Phrase'] = df['Phrase'].str.replace(r'\d+', '', regex=True)  # remove digits
df['Phrase'] = df['Phrase'].str.replace(r'[^\w\s]', '', regex=True)  # remove punctuation

In [None]:
# English stop words plus domain words that appear in almost every review.
# Converted to a list because frozenset.union returns a frozenset, and recent
# scikit-learn releases only accept 'english', a list, or None for stop_words.
my_stop_words = list(ENGLISH_STOP_WORDS.union(['film', 'movie', 'cinema', 'theatre', 'hollywood']))

In [None]:
# TF-IDF over unigrams, bigrams and trigrams of every phrase.
vectorizer = TfidfVectorizer(stop_words=my_stop_words, ngram_range=(1, 3))
vectors = vectorizer.fit_transform(df.Phrase)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement and returns an array indexed by feature column.
feature_names = vectorizer.get_feature_names_out()

In [None]:
# Placeholder list with one empty string per feature; filled in a later cell.
terms = ['' for _ in feature_names]

In [None]:
# Sanity check: number of slots allocated for the feature names.
len(terms)

In [None]:
# Dimensions of the TF-IDF matrix returned by fit_transform.
print(vectors.shape)

In [None]:
# Feature names as a plain Python list, indexed by TF-IDF column.
# get_feature_names() was removed in scikit-learn 1.2, and a direct conversion
# replaces the manual copy loop.
terms = vectorizer.get_feature_names_out().tolist()

In [None]:
# Spot-check a few feature names by column index, then show the non-zero
# TF-IDF entries of the second phrase.
for idx in (25624, 1503, 1502):
    print(terms[idx])
print(vectors[1])


In [None]:
# Show the first row of the data frame.
print(df.iloc[0])

In [None]:
def top_tfidf_feats(row, features, top_n=25):
    '''Return the ``top_n`` highest-weighted TF-IDF features of one document.

    Parameters
    ----------
    row : array-like or sparse row
        TF-IDF weights of a single document: either a 1-D dense array or a
        1 x n_features sparse row (any object exposing ``.toarray()``).
    features : sequence of str
        Feature names, indexed in the same order as the weights in ``row``.
    top_n : int, default 25
        Maximum number of features to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature`` and ``tfidf``, ordered by descending weight.
        Empty (but with both columns present) when there is nothing to rank.
    '''
    # Sparse rows cannot be argsorted or indexed like a 1-D array; densify first.
    if hasattr(row, 'toarray'):
        row = row.toarray()
    weights = np.asarray(row).ravel()
    topn_ids = np.argsort(weights)[::-1][:top_n]
    top_feats = [(features[i], weights[i]) for i in topn_ids]
    # Supplying the columns up front keeps the empty case from raising on
    # a column-count mismatch.
    return pd.DataFrame(top_feats, columns=['feature', 'tfidf'])

In [None]:
# Top-weighted TF-IDF terms of the first phrase. The function requires a row
# and the feature names; the sparse row is densified to the 1-D array that
# top_tfidf_feats sorts and indexes.
top_tfidf_feats(vectors[0].toarray().ravel(), feature_names)

## KMeans Model

In [None]:
# Number of clusters passed to KMeans (n_clusters) and iterated over when
# printing the top terms per cluster.
clusters = 3

In [None]:
# With n_init=1 the result depends entirely on one random initialisation, so
# the seed is fixed (same value as the train/test split) to keep the cluster
# assignments reproducible across kernel restarts.
model = KMeans(n_clusters=clusters, init='k-means++', max_iter=100, n_init=1, random_state=42)

In [None]:
# Fit KMeans on the TF-IDF matrix of all phrases.
model.fit(vectors)

In [None]:
# For each cluster centre, feature indices ordered from highest to lowest weight.
# NOTE(review): the same expression is recomputed as `order_centroids` in the
# next cell, and `centroids` is not read in the visible cells.
centroids = model.cluster_centers_.argsort()[:, ::-1]

In [None]:
print("Top terms per cluster:")
# Feature indices of each cluster centre, sorted by descending weight.
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
# get_feature_names() was removed in scikit-learn 1.2.
terms = vectorizer.get_feature_names_out()
for i in range(clusters):
    # The Python 2 idioms (`print(...),` and a bare `print`) neither suppress
    # nor emit newlines in Python 3, so build each cluster's line explicitly.
    top_terms = ' '.join(terms[ind] for ind in order_centroids[i, :10])
    print("Cluster %d: %s" % (i, top_terms))

print("\n")
print("Prediction")


In [None]:
# Assign two hand-written phrases to clusters using the fitted vectorizer and model.
for phrase in ("started bad got worse", "great performance by a great director"):
    Y = vectorizer.transform([phrase])
    prediction = model.predict(Y)
    print(prediction)

## Naive Bayes

In [None]:
# Merge the extreme labels into their neighbours: 0 -> 1 and 4 -> 3.
df['Sentiment'] = df['Sentiment'].replace({0: 1, 4: 3})

In [None]:
# Bar chart of phrase counts per sentiment label after the labels were merged.
collapsed_counts = df.groupby("Sentiment")["Sentiment"].count()
collapsed_counts.plot(kind="bar", ylim=0)

In [None]:
# Features are the cleaned phrases, targets are the sentiment labels;
# hold out 33% of the rows for evaluation with a fixed seed.
X, y = df['Phrase'], df['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Baseline classifier: TF-IDF (1- to 3-grams) feeding Multinomial Naive Bayes.
text_clf_nb_tf = Pipeline(steps=[
    ('vect', TfidfVectorizer(stop_words=my_stop_words, ngram_range=(1, 3))),
    ('clf', MultinomialNB()),
])

In [None]:
# Fit the TF-IDF + Multinomial Naive Bayes pipeline on the training split.
text_clf_nb_tf.fit(X_train, y_train)

In [None]:
# Class probabilities for one hand-written phrase. The earlier spelling
# "uptimately" is not a real word, so the vectorizer would drop it as
# out-of-vocabulary and the phrase would be scored on "fell flat" alone.
data = ["ultimately fell flat"]

text_clf_nb_tf.predict_proba(data)

In [None]:
# Per-class probabilities for every phrase in the test split.
predictions_nb_tf_proba = text_clf_nb_tf.predict_proba(X_test)

In [None]:
# Number of rows in the probability output for the test split.
len(predictions_nb_tf_proba)

In [None]:
# Collect each probability row once per class probability above 0.8,
# and count how many such entries there are.
a = [row for row in predictions_nb_tf_proba for p in row if p > 0.8]
num_grt_80 = len(a)

In [None]:
print(num_grt_80)
#print(a)
# Counts recorded by the author from earlier runs (51500 rows scored):
#   1879 rows had a class probability above 0.90
#   7863 rows had a class probability above 0.80

In [None]:
# Hard class predictions for the test split; the metric cells below read this.
predictions_nb_tf = text_clf_nb_tf.predict(X_test)

In [None]:
# Confusion matrix: rows are true labels, columns are predicted labels.
conf_matrix = metrics.confusion_matrix(y_test, predictions_nb_tf)
print(conf_matrix)

In [None]:
# Per-class precision, recall and F1 on the test split.
report = metrics.classification_report(y_test, predictions_nb_tf)
print(report)

In [None]:
# Overall accuracy on the test split.
accuracy = metrics.accuracy_score(y_test, predictions_nb_tf)
print(accuracy)