In [1]:
import pandas as pd

In [2]:
def read_data():
    train = pd.read_csv('train.tsv', sep='\t')
    test = pd.read_csv('train.tsv', sep='\t')
    test_y = pd.read_csv('train.tsv', sep='\t')

    train_label = train['sentiment']
    train_review =  train['review'].str.replace('&lt;.*?&gt;', ' ', regex=True)

    test_label = test_y['sentiment']
    test_review = test['review'].str.replace('&lt;.*?&gt;', ' ', regex=True)

    return train_review, train_label, test_review, test_label
# read myvocab.txt
vocab = pd.read_csv('myvocab_1000.txt')

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

def fit_vectorizer(train_review, test_review, vocab):
    vectorizer = TfidfVectorizer(
    stop_words='english',
    lowercase=True,  # Converts all text to lowercase by default
    ngram_range=(1, 4),  # Extracts unigrams only by default
    preprocessor=lambda x: x.lower(),  # Convert to lowercase
    token_pattern=r"\b[\w+\|']+\b" # Use word tokenizer: See Ethan's comment below
    )
    vectorizer.fit(vocab.values.flatten())
    train_review = vectorizer.transform(train_review)
    test_review = vectorizer.transform(test_review)
    return train_review, test_review

In [11]:
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegressionCV
def score(train_review, train_label, test_review, test_label):
    # fit with logistic regression for classification
    from sklearn.metrics import accuracy_score

    model = LogisticRegressionCV(cv=5, max_iter=10000, n_jobs=-1)
    model.fit(train_review, train_label)
    # calculate AUC score
    pred = model.predict_proba(test_review)

    return roc_auc_score(test_label, pred[:, 1]), pred

In [5]:
vocab_1000 =  pd.read_csv('myvocab_1000.txt')
vocab_2000 =  pd.read_csv('myvocab_2000.txt')
vocab_3000 =  pd.read_csv('myvocab_3000.txt')
vocabs = [vocab_1000, vocab_2000, vocab_3000]

In [6]:
vocab_1000.shape

(1000, 1)

In [16]:
train_review, train_label, test_review, test_label = read_data()

for vocab in vocabs:
    train_score, test_score = fit_vectorizer(train_review.copy(), test_review.copy(), vocab['Feature'])
    auc, pred = score(train_score, train_label, test, test_label)
#     print(auc, pred)

0.9711199154283097 [[0.97617763 0.02382237]
 [0.14500221 0.85499779]
 [0.03833007 0.96166993]
 ...
 [0.99395236 0.00604764]
 [0.58238597 0.41761403]
 [0.93501374 0.06498626]]
0.9816587937090717 [[0.98747491 0.01252509]
 [0.14583294 0.85416706]
 [0.0782719  0.9217281 ]
 ...
 [0.99707924 0.00292076]
 [0.69901774 0.30098226]
 [0.86055266 0.13944734]]
0.9862876416944842 [[0.99052402 0.00947598]
 [0.62515521 0.37484479]
 [0.04798215 0.95201785]
 ...
 [0.99214822 0.00785178]
 [0.65546201 0.34453799]
 [0.81391356 0.18608644]]


In [8]:
def main(i, vocab):
    train_review, train_label, test_review, test_label = read_data()

    train, test = fit_vectorizer(train_review.copy(), test_review.copy(), vocab['Feature'])
    s = score(train, train_label, test, test_label)
    result = 'split_' + str(i) + ' ' + str(s) + '\n'
    return result

In [9]:
# # using joblib to parallelize the process
# from joblib import Parallel, delayed, cpu_count
# count = cpu_count()

# results = Parallel(n_jobs=count)(delayed(main)(i, vocab_1000) for i in range(1, 6))
result = main(1,vocab_1000)

In [28]:
import csv
csv_file_path = "output.csv"

# Write the CSV file
with open(csv_file_path, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)

    # Write the header
    writer.writerow(['id', 'prob'])

    # Write the data
    for id_value, prob_value in zip(test_review, pred):
        writer.writerow([id_value, prob_value[0]])

  (0, 3078)	0.11831152736322537
  (0, 3070)	0.11831152736322537
  (0, 3027)	0.11831152736322537
  (0, 2999)	0.08963711646044083
  (0, 2996)	0.11831152736322537
  (0, 2869)	0.10844723768335114
  (0, 2785)	0.09061896517782843
  (0, 2782)	0.11831152736322537
  (0, 2763)	0.10267699812437403
  (0, 2715)	0.2250825756084965
  (0, 2598)	0.11831152736322537
  (0, 2584)	0.11831152736322537
  (0, 2562)	0.11831152736322537
  (0, 2561)	0.11254128780424826
  (0, 2534)	0.11831152736322537
  (0, 2433)	0.10527164569929216
  (0, 2411)	0.11831152736322537
  (0, 2396)	0.10844723768335114
  (0, 2383)	0.11831152736322537
  (0, 2382)	0.10844723768335114
  (0, 2223)	0.11831152736322537
  (0, 2214)	0.10844723768335114
  (0, 1832)	0.19485518453243117
  (0, 1792)	0.11831152736322537
  (0, 1753)	0.10527164569929216
  :	:
  (24999, 2392)	0.12121570860844773
  (24999, 2384)	0.11231405429084135
  (24999, 2149)	0.1322414100371575
  (24999, 2116)	0.1322414100371575
  (24999, 2107)	0.39672423011147245
  (24999, 2035)	0

# 