In [41]:
import pandas as pd

In [42]:
# download nltk stopwords
import nltk
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to C:\Users\Yangliang
[nltk_data]     Lu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [43]:
def read_data(i):
	train = pd.read_csv('data/split_' + str(i) + '/train.tsv', sep='\t')
	test = pd.read_csv('data/split_' + str(i) + '/test.tsv', sep='\t')
	test_y = pd.read_csv('data/split_' + str(i) + '/test_y.tsv', sep='\t')

	train_label = train['sentiment']
	train_review =  train['review'].str.replace('&lt;.*?&gt;', ' ', regex=True)

	test_label = test_y['sentiment']
	test_review = test['review'].str.replace('&lt;.*?&gt;', ' ', regex=True)

	return train_review, train_label, test_review, test_label


In [44]:
# read myvocab.txt
vocab = pd.read_csv('myvocab.txt', sep='\t')
vocab['Feature'].head()

0    supposed comedy
1       worse acting
2       please waste
3          instead 1
4         forwarding
Name: Feature, dtype: object

In [45]:
def vectorizer(min_df = None, max_df = None, ngram_range = (1, 2)):
    vector = TfidfVectorizer(
        preprocessor=lambda x: x.lower(),  # Convert to lowercase
        stop_words=stopwords,             # Remove stop words
        ngram_range=ngram_range,               # Use 1- to 4-grams
          # Use word tokenizer: See Ethan's comment below
    )
    if min_df is not None:
        vector.set_params(min_df=min_df)
    if max_df is not None:
        vector.set_params(max_df=max_df)
    return vector

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer
from pickle import load
def fit_vectorizer(train_review, test_review, vocab):
	vector = vectorizer()
	# vector = load(open('vectorizer.pkl', 'rb'))
	vector.fit(vocab)
	train = vector.transform(train_review)
	test = vector.transform(test_review)
	return train, test

In [47]:
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegressionCV
def score(train_review, train_label, test_review, test_label):
	# fit with logistic regression for classification
	from sklearn.metrics import accuracy_score

	model = LogisticRegressionCV(cv=5, max_iter=10000, n_jobs=-1, solver='liblinear')
	model.fit(train_review, train_label)
	# calculate AUC score
	pred = model.predict_proba(test_review)

	return roc_auc_score(test_label, pred[:, 1])

In [48]:
def main(i, vocab):
	train_review, train_label, test_review, test_label = read_data(i)

	train, test = fit_vectorizer(train_review, test_review, vocab['Feature'])
	s = score(train, train_label, test, test_label)
	result = 'split_' + str(i) + ' ' + str(s) + '\n'
	return result

In [49]:
# using joblib to parallelize the process
from joblib import Parallel, delayed, cpu_count
count = cpu_count()

results = Parallel(n_jobs=count)(delayed(main)(i, vocab) for i in range(1, 6))

In [50]:
for result in results:
	print(result)

split_1 0.9607600025218678

split_2 0.9611106214308613

split_3 0.9604750611738162

split_4 0.9613678728754387

split_5 0.9608693665237437

