In [1]:
import pandas as pd
import numpy as np
import nltk

In [2]:
data = pd.read_csv("train.csv", encoding = 'latin')
print(data.head())

   ItemID  Sentiment                                      SentimentText
0       1          0                       is so sad for my APL frie...
1       2          0                     I missed the New Moon trail...
2       3          1                            omg its already 7:30 :O
3       4          0            .. Omgaga. Im sooo  im gunna CRy. I'...
4       5          0           i think mi bf is cheating on me!!!   ...


In [3]:
data.dropna(inplace = True)
print(data.shape)
print(data['Sentiment'].mean())

(99989, 3)
0.5646321095320486


In [4]:
from sklearn.model_selection import train_test_split

In [5]:
X_train, X_test, y_train, y_test = train_test_split(data['SentimentText'], data['Sentiment'], test_size=0.15, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.235, random_state=1)
print("X_train:", X_train.shape)
print("X_val:", X_val.shape)
print("X_test:", X_test.shape)

X_train: (65017,)
X_val: (19973,)
X_test: (14999,)


In [7]:
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
countVect = CountVectorizer().fit(X_train)
print(countVect.get_feature_names()[::5000])
print("Feature count:", len(countVect.get_feature_names()))

['00', 'abit', 'allanfrancisco', 'anniespajamas', 'ayekaygee', 'biggbybob', 'bryantma', 'character', 'companymancomic', 'drives', 'gstlrf', 'kip', 'n8ai', 'qdo', 'soulfish', 'unforgettable']
Feature count: 78872


In [9]:
from sklearn.linear_model import LogisticRegression

In [9]:
X_train_vectorized = countVect.transform(X_train)
model = LogisticRegression(C = 0.71)
model.fit(X_train_vectorized, y_train)

LogisticRegression(C=0.71, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [10]:
predictions = model.predict(countVect.transform(X_val))

In [14]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score 
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score

In [12]:
print("Accuracy score", accuracy_score(y_val, predictions))
print("f1_score(weighted):", f1_score(y_val, predictions, average = 'weighted'))
print("average_precision_score(weighted):", average_precision_score(y_val, predictions, average = 'weighted'))
print("recall_score(weighted):", recall_score(y_val, predictions, average = 'weighted'))

Accuracy score 0.7696890802583488
f1_score(weighted): 0.7678701423516031
average_precision_score(weighted): 0.7431107020580949
recall_score(weighted): 0.7696890802583488


In [13]:
testPredictions = model.predict(countVect.transform(X_test))

In [14]:
print("Accuracy score", accuracy_score(y_test, testPredictions))
print("f1_score(weighted):", f1_score(y_test, testPredictions, average = 'weighted'))
print("average_precision_score(weighted):", average_precision_score(y_test, testPredictions, average = 'weighted'))
print("recall_score(weighted):", recall_score(y_test, testPredictions, average = 'weighted'))

Accuracy score 0.7720514700980066
f1_score(weighted): 0.7704230429407638
average_precision_score(weighted): 0.7429734974934948
recall_score(weighted): 0.7720514700980066


In [15]:
feature_names = np.array(countVect.get_feature_names())

sorted_coef_index = model.coef_[0].argsort()

print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs:
['sad' 'inaperfectworld' 'dontyouhate' 'sucks' 'poor' 'sadly' 'rip'
 'cancelled' 'missing' 'bummer']

Largest Coefs: 
['musicmonday' 'welcome' 'bear' 'congrats' 'followfriday' 'worries'
 'congratulations' 'worry' 'woohoo' 'thanks']


In [16]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|+*,#;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z @_]')
STOPWORDS = set(stopwords.words('english'))

def text_prepare(text):
    text = text.lower()
    text = re.sub(REPLACE_BY_SPACE_RE, " ", text)
    text = re.sub(BAD_SYMBOLS_RE, "", text)
    text_tokens = word_tokenize(text)
    filtered_sentence = ""
    for w in text_tokens:
        if w not in STOPWORDS:
            filtered_sentence += w + " "
    return filtered_sentence[:-1]

In [17]:
X2_train = [text_prepare(x) for x in X_train]
X2_val = [text_prepare(x) for x in X_val]
X2_test = [text_prepare(x) for x in X_test]

In [18]:
words_counts = {}

for tweet in X2_train:
    for word in tweet.split():
        words_counts[word] = words_counts.setdefault(word, 0) + 1

most_common_words = sorted(words_counts.items(), key=lambda x: x[1], reverse=True)[:10]

print(most_common_words)

[('@', 58959), ('im', 7291), ('good', 3805), ('like', 3611), ('get', 3446), ('u', 3329), ('lol', 3236), ('dont', 3174), ('quot', 3128), ('know', 2935)]


In [19]:
DICT_SIZE = 5000

most_common_words = sorted(words_counts.items(), key=lambda x: x[1], reverse=True)[:DICT_SIZE]

WORDS_TO_INDEX = {}
i = 0
for entry in most_common_words:
    WORDS_TO_INDEX[entry[0]] = i
    i += 1

INDEX_TO_WORDS = {}
i = 0
for entry in most_common_words:
    INDEX_TO_WORDS[i] = entry[0]
    i += 1

ALL_WORDS = WORDS_TO_INDEX.keys()

def my_bag_of_words(text, words_to_index, dict_size):
    result_vector = np.zeros(dict_size)
    for word in text.split():
        if word in words_to_index:
            result_vector[words_to_index[word]] += 1
    return result_vector


In [20]:
from scipy import sparse as sp_sparse

In [21]:
X_train_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X2_train])
X_val_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X2_val])
X_test_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X2_test])
print('X_train shape ', X_train_mybag.shape)
print('X_val shape ', X_val_mybag.shape)
print('X_test shape ', X_test_mybag.shape)

X_train shape  (65017, 5000)
X_val shape  (19973, 5000)
X_test shape  (14999, 5000)


In [22]:
classifier = LogisticRegression(C = 0.5).fit(X_train_mybag, y_train)

In [23]:
predictions2 = classifier.predict(X_val_mybag)

In [24]:
print("Accuracy score", accuracy_score(y_val, predictions2))
print("f1_score(weighted):", f1_score(y_val, predictions2, average = 'weighted'))
print("average_precision_score(weighted):", average_precision_score(y_val, predictions2, average = 'weighted'))
print("recall_score(weighted):", recall_score(y_val, predictions2, average = 'weighted'))

Accuracy score 0.7551194112051269
f1_score(weighted): 0.7519824642903692
average_precision_score(weighted): 0.727193898957285
recall_score(weighted): 0.7551194112051269


In [25]:
testPredictions2 = classifier.predict(X_test_mybag)

In [26]:
print("Accuracy score", accuracy_score(y_test, testPredictions2))
print("f1_score(weighted):", f1_score(y_test, testPredictions2, average = 'weighted'))
print("average_precision_score(weighted):", average_precision_score(y_test, testPredictions2, average = 'weighted'))
print("recall_score(weighted):", recall_score(y_test, testPredictions2, average = 'weighted'))

Accuracy score 0.7576505100340023
f1_score(weighted): 0.7550034504487017
average_precision_score(weighted): 0.7276347273884347
recall_score(weighted): 0.7576505100340023


In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [28]:
tfidfVect = TfidfVectorizer(min_df=5).fit(X_train)
print(len(tfidfVect.get_feature_names()))

9308


In [29]:
tfX_train_vectorized = tfidfVect.transform(X_train)

tfmodel = LogisticRegression(C = 1.2)
tfmodel.fit(tfX_train_vectorized, y_train)

LogisticRegression(C=1.2, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [30]:
tfpredictions = tfmodel.predict(tfidfVect.transform(X_val))

In [31]:
print("Accuracy score", accuracy_score(y_val, tfpredictions))
print("f1_score(weighted):", f1_score(y_val, tfpredictions, average = 'weighted'))
print("average_precision_score(weighted):", average_precision_score(y_val, tfpredictions, average = 'weighted'))
print("recall_score(weighted):", recall_score(y_val, tfpredictions, average = 'weighted'))

Accuracy score 0.7696390126671006
f1_score(weighted): 0.7677517288080807
average_precision_score(weighted): 0.7428221173739502
recall_score(weighted): 0.7696390126671006


In [32]:
testTfpredictions = tfmodel.predict(tfidfVect.transform(X_test))

In [33]:
print("Accuracy score", accuracy_score(y_test, testTfpredictions))
print("f1_score(weighted):", f1_score(y_test, testTfpredictions, average = 'weighted'))
print("average_precision_score(weighted):", average_precision_score(y_test, testTfpredictions, average = 'weighted'))
print("recall_score(weighted):", recall_score(y_test, testTfpredictions, average = 'weighted'))

Accuracy score 0.7732515501033402
f1_score(weighted): 0.7716449986387606
average_precision_score(weighted): 0.7440795684885831
recall_score(weighted): 0.7732515501033402


In [34]:
tffeature_names = np.array(tfidfVect.get_feature_names())

sorted_tfidf_index = tfX_train_vectorized.max(0).toarray()[0].argsort()

print('Smallest tfidf:\n{}\n'.format(tffeature_names[sorted_tfidf_index[:10]]))
print('Largest tfidf: \n{}'.format(tffeature_names[sorted_tfidf_index[:-11:-1]]))

Smallest tfidf:
['ðµð½ñ' 'm2e' 'shareholder' 'tweeterfollow' 'tweeteradder' 'relieve'
 'h01jg' '05' 'casino' 'longestpoemintheworld']

Largest tfidf: 
['øªù' 'sniff' 'was' 'cry' 'crucifire' 'gutted' 'guys' 'ace' 'wants' 'smh']


In [35]:
tfsorted_coef_index = tfmodel.coef_[0].argsort()

print('Smallest Coefs:\n{}\n'.format(tffeature_names[tfsorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(tffeature_names[tfsorted_coef_index[:-11:-1]]))

Smallest Coefs:
['sad' 'sorry' 'sucks' 'miss' 'poor' 'wish' 'inaperfectworld' 'sick'
 'sadly' 'missed']

Largest Coefs: 
['thanks' 'welcome' 'great' 'thank' 'followfriday' 'glad' 'musicmonday'
 'congrats' 'awesome' 'worry']


In [36]:
print(tfmodel.predict(tfidfVect.transform(['love',
                                    'working'])))

[1 0]


In [10]:
ngramVect = CountVectorizer(min_df=5, ngram_range=(1,2)).fit(X_train)
print(len(ngramVect.get_feature_names()))

26804


In [11]:
ngramX_train_vectorized = ngramVect.transform(X_train)
ngramModel = LogisticRegression(C = 0.3)
ngramModel.fit(ngramX_train_vectorized, y_train)

LogisticRegression(C=0.3, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [12]:
ngramPredictions = ngramModel.predict(ngramVect.transform(X_val))

In [15]:
print("Accuracy score", accuracy_score(y_val, ngramPredictions))
print("f1_score(weighted):", f1_score(y_val, ngramPredictions, average = 'weighted'))
print("average_precision_score(weighted):", average_precision_score(y_val, ngramPredictions, average = 'weighted'))
print("recall_score(weighted):", recall_score(y_val, ngramPredictions, average = 'weighted'))

Accuracy score 0.7764482050768537
f1_score(weighted): 0.7744223544637212
average_precision_score(weighted): 0.7480119727722014
recall_score(weighted): 0.7764482050768537


In [16]:
testNgramPredictions = ngramModel.predict(ngramVect.transform(X_test))

In [17]:
print("Accuracy score", accuracy_score(y_test, testNgramPredictions))
print("f1_score(weighted):", f1_score(y_test, testNgramPredictions, average = 'weighted'))
print("average_precision_score(weighted):", average_precision_score(y_test, testNgramPredictions, average = 'weighted'))
print("recall_score(weighted):", recall_score(y_test, testNgramPredictions, average = 'weighted'))

Accuracy score 0.7769184612307487
f1_score(weighted): 0.7751548322598547
average_precision_score(weighted): 0.746587773774744
recall_score(weighted): 0.7769184612307487


In [18]:
ngramfeature_names = np.array(ngramVect.get_feature_names())

ngramsorted_coef_index = ngramModel.coef_[0].argsort()

print('Smallest Coefs:\n{}\n'.format(ngramfeature_names[ngramsorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(ngramfeature_names[ngramsorted_coef_index[:-11:-1]]))

Smallest Coefs:
['sad' 'inaperfectworld' 'poor' 'sucks' 'dontyouhate' 'missing' 'sadly'
 'miss' 'sick' 'rip']

Largest Coefs: 
['cant wait' 'no problem' 'welcome' 'thanks' 'musicmonday' 'followfriday'
 'congrats' 'congratulations' 'no prob' 'sweet']


In [19]:
print(ngramModel.predict(ngramVect.transform(['phone is good', 'horrible person'])))

[1 0]


In [45]:
from sklearn import svm

In [46]:
clfVect = CountVectorizer(min_df=5).fit(X_train)

In [10]:
print("Feature count:", len(clfVect.get_feature_names()))

Feature count: 9308


In [46]:
clfX_train_vectorized = clfVect.transform(X_train) 

In [58]:
clf = svm.NuSVC(kernel = 'poly', degree = 2)
clf.fit(clfX_train_vectorized, y_train)

NuSVC(cache_size=200, class_weight=None, coef0=0.0,
   decision_function_shape='ovr', degree=2, gamma='auto', kernel='poly',
   max_iter=-1, nu=0.5, probability=False, random_state=None,
   shrinking=True, tol=0.001, verbose=False)

In [59]:
clfpredictions = clf.predict(clfVect.transform(X_val))

In [60]:
print("Accuracy score", accuracy_score(y_val, clfpredictions))
print("f1_score(weighted):", f1_score(y_val, clfpredictions, average = 'weighted'))
print("average_precision_score(weighted):", average_precision_score(y_val, clfpredictions, average = 'weighted'))
print("recall_score(weighted):", recall_score(y_val, clfpredictions, average = 'weighted'))

Accuracy score 0.5376758624142592
f1_score(weighted): 0.42339713718934374
average_precision_score(weighted): 0.5585155187941309
recall_score(weighted): 0.5376758624142592


In [61]:
TestClfpredictions = clf.predict(clfVect.transform(X_test))

In [62]:
print("Accuracy score", accuracy_score(y_test, TestClfpredictions))
print("f1_score(weighted):", f1_score(y_test, TestClfpredictions, average = 'weighted'))
print("average_precision_score(weighted):", average_precision_score(y_test, TestClfpredictions, average = 'weighted'))
print("recall_score(weighted):", recall_score(y_test, TestClfpredictions, average = 'weighted'))

Accuracy score 0.5273018201213414
f1_score(weighted): 0.4126559747863872
average_precision_score(weighted): 0.5510158445950867
recall_score(weighted): 0.5273018201213414


In [52]:
clffeature_names = np.array(clfVect.get_feature_names())

clfsorted_coef_index = clf.coef_[0].argsort()

print('Smallest Coefs:\n{}\n'.format(clffeature_names[clfsorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(clffeature_names[clfsorted_coef_index[:-11:-1]]))

Smallest Coefs:
['farrah' 'leak' 'no1' 'dontyouhate' '½o' 'upsetting' 'raincheck' 'boooo'
 'heartbreaking' 'canceled']

Largest Coefs: 
['geeks' 'whew' 'char_x3' 'iluu' 'arabidopsis' 'encourage' 'combine'
 'rollin' 'elmo' 'carpool']


In [None]:
repeat = True
while repeat == True:
    string = input("Enter custom string:(0 to exit)")
    if string != '0':
        print(ngramModel.predict(ngramVect.transform([string])))
    else:
        repeat = False