In [113]:
import pandas as pd

In [114]:
def read_data(i):
	train = pd.read_csv('data/split_' + str(i) + '/train.tsv', sep='\t')
	test = pd.read_csv('data/split_' + str(i) + '/train.tsv', sep='\t')
	test_y = pd.read_csv('data/split_' + str(i) + '/train.tsv', sep='\t')

	train_label = train['sentiment']
	train_review =  train['review'].str.replace('&lt;.*?&gt;', ' ', regex=True)

	test_label = test_y['sentiment']
	test_review = test['review'].str.replace('&lt;.*?&gt;', ' ', regex=True)

	return train_review, train_label, test_review, test_label
# read myvocab.txt
vocab = pd.read_csv('myvocab.txt')

In [115]:
from sklearn.feature_extraction.text import TfidfVectorizer

def fit_vectorizer(train_review, test_review, vocab):
	vectorizer = TfidfVectorizer(
    stop_words='english',
    lowercase=True,  # Converts all text to lowercase by default
    ngram_range=(1, 4),  # Extracts unigrams only by default
	preprocessor=lambda x: x.lower(),  # Convert to lowercase
    token_pattern=r"\b[\w+\|']+\b" # Use word tokenizer: See Ethan's comment below
	)
	vectorizer.fit(vocab.values.flatten())
	train_review = vectorizer.transform(train_review)
	test_review = vectorizer.transform(test_review)
	return train_review, test_review

In [116]:
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegressionCV
def score(train_review, train_label, test_review, test_label):
	# fit with logistic regression for classification
	from sklearn.metrics import accuracy_score

	model = LogisticRegressionCV(cv=5, max_iter=10000, n_jobs=-1)
	model.fit(train_review, train_label)
	# calculate AUC score
	pred = model.predict_proba(test_review)

	return roc_auc_score(test_label, pred[:, 1])

In [117]:
vocab_1000 =  pd.read_csv('myvocab_1000.txt')
vocab_2000 =  pd.read_csv('myvocab_2000.txt')
vocab_3000 =  pd.read_csv('myvocab_3000.txt')
vocabs = [vocab_1000, vocab_2000, vocab_3000]

In [122]:
vocab_1000.shape

(1000, 1)

In [119]:
train_review, train_label, test_review, test_label = read_data(1)

for vocab in vocabs:
	train, test = fit_vectorizer(train_review.copy(), test_review.copy(), vocab['Feature'])
	print(score(train, train_label, test, test_label))

0.9808027392332601
0.994410309920742
0.9937656477130061


In [125]:
def main(i, vocab):
	train_review, train_label, test_review, test_label = read_data(i)

	train, test = fit_vectorizer(train_review.copy(), test_review.copy(), vocab['Feature'])
	s = score(train, train_label, test, test_label)
	result = 'split_' + str(i) + ' ' + str(s) + '\n'
	return result

In [126]:
# using joblib to parallelize the process
from joblib import Parallel, delayed, cpu_count
count = cpu_count()

results = Parallel(n_jobs=count)(delayed(main)(i, vocab_1000) for i in range(1, 6))

In [127]:
for result in results:
	print(result)

split_1 0.9808027392332601

split_2 0.9711022958994868

split_3 0.9718987133018542

split_4 0.9713212488455994

split_5 0.9713832943223124

