In [96]:
import pandas as pd

In [97]:
!pwd

'pwd' is not recognized as an internal or external command,
operable program or batch file.


In [98]:
split1 = pd.read_csv('split_1/train.tsv', sep='\t')

In [99]:
split1.head()

Unnamed: 0,id,sentiment,review
0,1,1,Naturally in a film who's main themes are of m...
1,4,0,Afraid of the Dark left me with the impression...
2,7,0,This has to be one of the biggest misfires eve...
3,8,0,"This is one of those movies I watched, and won..."
4,17,0,This movie was dreadful. Biblically very inacc...


In [100]:
split1['sentiment'] = split1['sentiment'].astype('int')

In [101]:
split1.dtypes

id            int64
sentiment     int32
review       object
dtype: object

In [102]:
## Review cleanup (this process takes a significant amount of time)
# Removing stop words, @ mentions, webpages and special characters

from nltk.corpus import stopwords # nltk.download('stopwords') before importing
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

def clean(review):
    stage1 = [word for word in review.lower().split() if word not in stopwords.words('english')] # stopword removal
    return ' '.join(stage1)

In [103]:
%%time

split1['review_cleaned'] = split1['review'].apply(clean)

Wall time: 23min 7s


In [104]:
train_1x = split1['review_cleaned']
train_1y = split1['sentiment']

In [105]:
## Analysis

from sklearn.feature_extraction.text import TfidfVectorizer

In [154]:
vector = TfidfVectorizer(max_features = 2000, ngram_range = (1, 2), stop_words = 'english')
%time vector.fit(train_1x)

Wall time: 16.9 s


TfidfVectorizer(max_features=2000, ngram_range=(1, 2), stop_words='english')

In [147]:
vector = TfidfVectorizer(vocabulary = vocab, ngram_range = (1, 2), stop_words = 'english')
%time vector.fit(train_1x)

Wall time: 4.71 s


TfidfVectorizer(ngram_range=(1, 2), stop_words='english',
                vocabulary=dict_keys(['naturally', 'film', 'main', 'themes', 'loss', 'surprising', 'rated', 'highly', 'older', 'viewers', 'younger', 'ones', 'enjoy', 'pace', 'constant', 'characters', 'engaging', 'relationships', 'natural', 'showing', 'need', 'tears', 'emotion', 'fear', 'violence', 'short', 'story', 'ready', 'perfect', 'sm..., 'joan', 'spanish', 'joseph', 'directing', 'gang', 'anthony', 'context', 'speech', 'dollars', 'br overall', 'heads', 'felt like', 'twist', 'priest', 'fault', 'sucks', 'batman', 'parody', 'featuring', 'eat', 'test', 'andy', 'thoughts', 'stone', 'danny', 'genuinely', 'pleasure', 'grand', 'halloween', 'author', 'allen', 'river', 'howard', 'corny', 'nonsense', 'alex', 'surely', 'jackson']))

In [155]:
train_1x_transformed = vector.transform(train_1x)

In [156]:
train_1x_transformed

<25000x2000 sparse matrix of type '<class 'numpy.float64'>'
	with 1427174 stored elements in Compressed Sparse Row format>

In [109]:
from sklearn.naive_bayes import MultinomialNB

In [110]:
from sklearn.linear_model import LogisticRegression

In [136]:
from xgboost import XGBClassifier

In [135]:
import sys
!{sys.executable} -m pip install xgboost

Collecting xgboost
  Downloading xgboost-1.2.1-py3-none-win_amd64.whl (86.5 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.2.1


In [157]:
xgb = XGBClassifier()

In [112]:
nb = MultinomialNB()

In [150]:
lg = LogisticRegression()

In [151]:
lg.fit(train_1x_transformed, train_1y)

LogisticRegression()

In [158]:
xgb.fit(train_1x_transformed, train_1y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [115]:
nb.fit(train_1x_transformed, train_1y)

MultinomialNB()

In [116]:
nb.predict(train_1x_transformed[0])

array([1])

In [117]:
nb.predict_proba(train_1x_transformed[0])

array([[0.08701922, 0.91298078]])

In [118]:
nb.predict_log_proba(train_1x_transformed[0])

array([[-2.44162628, -0.09104045]])

In [119]:
len(vector.vocabulary_)

2000

In [144]:
vocab = vector.vocabulary_.keys()

In [121]:
### Test it

split1_test_x = pd.read_csv('split_1/test.tsv', sep = '\t')
split1_test_y = pd.read_csv('split_1/test_y.tsv', sep = '\t')

In [122]:
split1_test_x['review_cleaned'] = split1_test_x['review'].apply(clean)

In [123]:
split_id = split1_test_x['id'].to_numpy()

In [124]:
split_id

array([  598, 12048, 40908, ...,  3620, 24858, 13068], dtype=int64)

In [125]:
test_1x = vector.transform(split1_test_x['review_cleaned'])

In [126]:
test_1y = split1_test_y['sentiment']

In [127]:
pred1 = nb.predict(test_1x)

In [145]:
pred1_prob = lg.predict_proba(test_1x)

In [159]:
pred1_prob = xgb.predict_proba(test_1x)

In [129]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [130]:
print('Accuracy - {}'.format(accuracy_score(test_1y, pred1)))
print('Confusion matrix - {}'.format(confusion_matrix(test_1y, pred1)))

Accuracy - 0.84548
Confusion matrix - [[10374  2153]
 [ 1710 10763]]


In [131]:
len(pred1_prob)

25000

In [132]:
len(split_id)

25000

In [160]:
file = open("mysubmission.txt","w")
file.write('"id"    "prob"')
file.write("\n")
num = len(split_id)
for x,y in enumerate(split_id):
    string = str(y) + "    "+ str(pred1_prob[x][1])
    file.write(string)
    if x != num - 1:
        file.write("\n")
file.close()
    