In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd 
from sklearn.feature_extraction.text import TfidfVectorizer 
import nltk, ssl 
from nltk.corpus import stopwords 
from nltk.stem.snowball import SnowballStemmer 
from sklearn.feature_extraction.text import CountVectorizer 

In [2]:
# Load the tab-separated dataset; the cells below read its 'Phrase' and
# 'Sentiment' columns.
data = pd.read_csv('data.tsv', sep='\t')

# Count the non-null phrases per sentiment label. Only the raw counts are kept,
# so the result is positionally indexed (0..n-1) in sorted-label order rather
# than indexed by the label values themselves.
phrase_counts = data.groupby('Sentiment')['Phrase'].count()
df = pd.Series(phrase_counts.values, name='Numbers')
df.head()

0     7072
1    27273
2    79582
3    32927
4     9206
Name: Numbers, dtype: int64

In [3]:
import re

# Work around environments where certificate validation makes nltk.download()
# fail by swapping in an unverified HTTPS context.
# NOTE(review): this replaces ssl's default HTTPS context factory for the whole
# kernel, not only for the NLTK download below -- confirm that is acceptable.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # This ssl module has no _create_unverified_context; keep the default.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context
nltk.download('stopwords')
# NOTE(review): nltk.word_tokenize (used below) needs NLTK tokenizer data that
# this cell does not download -- confirm it is installed on a fresh machine.

# Read the word list through `nltk.corpus` instead of the imported `stopwords`
# name. This cell rebinds `stopwords` to a plain list, so the original
# `stopwords.words(...)` raised AttributeError when the cell was run twice.
stopwords = nltk.corpus.stopwords.words('english')
# Set membership is O(1) per token; the list lookup was O(len(stopwords)).
stopword_set = set(stopwords)
stemmer = SnowballStemmer('english')

phrases1 = data['Phrase'].values
sentiments = data['Sentiment'].values

# Accept only tokens made purely of ASCII letters in which no character
# occurs three or more times in a row. Compiled once, outside the loop.
token_pattern = re.compile(r'(?!.*(.)\1{2,})^[a-zA-Z]*$')

# For every phrase: lowercase, tokenize, drop rejected tokens and stop words,
# stem what is left, and re-join into one space-separated string. Phrases
# whose tokens are all filtered out become the empty string.
phrases2 = [
    " ".join(
        stemmer.stem(token)
        for token in nltk.word_tokenize(phrase.lower())
        if token_pattern.match(token) and token not in stopword_set
    )
    for phrase in phrases1
]

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yerlan2/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the preprocessed phrases for evaluation; random_state pins
# the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    phrases2,
    sentiments,
    test_size=.3,
    random_state=42,
)

# TF-IDF features: the vocabulary is fitted on the training phrases only and
# then applied unchanged to the test phrases.
tfidfv = TfidfVectorizer()
X_train_tfidfv = tfidfv.fit_transform(X_train)
X_test_tfidfv = tfidfv.transform(X_test)

# Raw term-count features, built the same way (fit on train, apply to test).
cv = CountVectorizer()
X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)

In [5]:
from sklearn.linear_model import LogisticRegression

# Both runs use the same hyperparameters; only the feature matrix differs.
logreg_params = {"random_state": 50, "max_iter": 1024}

# Fit on the TF-IDF features and report the score on the held-out split.
clf = LogisticRegression(**logreg_params)
clf.fit(X_train_tfidfv, y_train)
score = clf.score(X_test_tfidfv, y_test)
print(f"LogisticRegression TfidfVectorizer accuracy: {score*100:.4f}%")

# Repeat with the raw count features. `clf` and `score` are rebound, so after
# this cell they refer to the CountVectorizer run.
clf = LogisticRegression(**logreg_params)
clf.fit(X_train_cv, y_train)
score = clf.score(X_test_cv, y_test)
print(f"LogisticRegression CountVectorizer accuracy: {score*100:.4f}%")

LogisticRegression TfidfVectorizer accuracy: 62.1278%
LogisticRegression CountVectorizer accuracy: 63.2342%


In [6]:
from sklearn import svm

# Both runs use the same hyperparameters; only the feature matrix differs.
svc_params = {"random_state": 40, "C": 1024, "dual": False}

# Fit on the TF-IDF features and report the score on the held-out split.
clf = svm.LinearSVC(**svc_params)
clf.fit(X_train_tfidfv, y_train)
score = clf.score(X_test_tfidfv, y_test)
print(f"SVM TfidfVectorizer accuracy: {score*100:.4f}%")

# Repeat with the raw count features. `clf` and `score` are rebound, so after
# this cell they refer to the CountVectorizer run.
clf = svm.LinearSVC(**svc_params)
clf.fit(X_train_cv, y_train)
score = clf.score(X_test_cv, y_test)
print(f"SVM CountVectorizer accuracy: {score*100:.4f}%")

SVM TfidfVectorizer accuracy: 61.9527%
SVM CountVectorizer accuracy: 61.6066%


In [7]:
from sklearn.naive_bayes import MultinomialNB

# Fit a default-configured multinomial naive Bayes model on the TF-IDF
# features and report the score on the held-out split.
clf = MultinomialNB()
clf.fit(X_train_tfidfv, y_train)
score = clf.score(X_test_tfidfv, y_test)
print(f"NaiveBayes TfidfVectorizer accuracy: {score*100:.4f}%")

# Repeat with the raw count features. `clf` and `score` are rebound, so after
# this cell they refer to the CountVectorizer run.
clf = MultinomialNB()
clf.fit(X_train_cv, y_train)
score = clf.score(X_test_cv, y_test)
print(f"NaiveBayes CountVectorizer accuracy: {score*100:.4f}%")

NaiveBayes TfidfVectorizer accuracy: 57.5121%
NaiveBayes CountVectorizer accuracy: 60.6818%
