In [1]:
import pandas as pd 
import numpy as np 
from sklearn.feature_extraction.text import CountVectorizer
import xgboost as xgb 
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import roc_auc_score, accuracy_score
import re 
import time

In [2]:
def get_vectorizer(vocab_file, ngram_range=(1, 10)):
    """Build a ``CountVectorizer`` restricted to the terms listed in a file.

    Parameters
    ----------
    vocab_file : str or path-like
        UTF-8 text file containing one vocabulary term per line.
    ngram_range : tuple of (int, int), default (1, 10)
        Passed through unchanged to ``CountVectorizer``.

    Returns
    -------
    CountVectorizer
        Vectorizer with a fixed vocabulary that maps each distinct, non-blank
        term to a column index, numbered in order of first appearance.
    """
    with open(vocab_file, encoding='UTF-8') as f:
        terms = [line.strip() for line in f]

    # Skip blank lines and repeated terms, numbering the survivors 0..n-1.
    # Indexing by raw line number would create an empty-string feature for a
    # blank line and leave a hole in the index range for a repeated term,
    # and CountVectorizer rejects a vocabulary whose indices are not contiguous.
    voca = dict()
    for term in terms:
        if term and term not in voca:
            voca[term] = len(voca)

    vectorizer = CountVectorizer(vocabulary=voca, ngram_range=ngram_range)
    return vectorizer


In [3]:
# Pattern that lazily matches everything from a '<' to the next '>'; the
# evaluation loops use it to strip HTML-style tags out of the review text.
cleaner = re.compile(r'<.*?>')

# Directory template for one data split: call .format(i) with the split number.
# It ends with a path separator so file names can be appended by concatenation.
split_dirc = 'D:\\studying\\UIUC courses\\STAT542\\project\\Project3\\splits\\split_{}\\'

In [4]:
# Vocabulary file (one term per line) that defines the feature columns; the
# vectorizer built from it reports 999 features in the next cell.
# NOTE(review): the file name suggests a lasso-selected vocabulary — confirm.
vocab_file = r'D:\studying\UIUC courses\STAT542\project\Project3\splits\vocab_lasso_10_999.txt'
# Alternative vocabulary file, currently disabled:
# vocab_file = 'D:\\studying\\UIUC courses\\STAT542\\project\\Project3\\splits\\vocab_lasso_840.txt'
# Shared vectorizer used by every cell below.
vectorizer = get_vectorizer(vocab_file)

In [5]:
# Number of feature columns the vectorizer produces.
# `get_feature_names` was removed in scikit-learn 1.2, so prefer its
# replacement when present and fall back to the old name on older releases.
len(
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, 'get_feature_names_out')
    else vectorizer.get_feature_names()
)

999

In [9]:
# Dry run over the five splits: fit a logistic regression on each split's
# training reviews and print the test AUC and the elapsed time. Nothing is
# written to disk and no results are accumulated.

# The vocabulary is fixed, so the feature names are the same for every split:
# look them up once instead of twice per iteration. `get_feature_names` was
# removed in scikit-learn 1.2, so prefer its replacement when it exists.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

for i in range(1, 6):
    start = time.time()
    print('split: ' + str(i))
    dirc = split_dirc.format(i)

    train_file = dirc + 'train.tsv'
    test_file = dirc + 'test.tsv'
    test_y_file = dirc + 'test_y.tsv'

    # Load the reviews and strip HTML-style tags before vectorizing.
    train = pd.read_csv(train_file, sep='\t', encoding='utf-8')
    train['review'] = train['review'].map(lambda s: cleaner.sub('', s))

    test = pd.read_csv(test_file, sep='\t', encoding='utf-8')
    test['review'] = test['review'].map(lambda s: cleaner.sub('', s))

    test_y = pd.read_csv(test_y_file, sep='\t', encoding='utf-8')

    # Term counts over the fixed vocabulary, as dense frames labelled with the
    # feature names so train and test columns are matched by name.
    X_train = vectorizer.transform(train['review'].values).toarray()
    X_train = pd.DataFrame(X_train, columns=feature_names)
    Y_train = train['sentiment']

    X_test = vectorizer.transform(test['review'].values).toarray()
    X_test = pd.DataFrame(X_test, columns=feature_names)

    # No penalty is specified, so scikit-learn's default (L2) applies.
    ridge = LogisticRegression(C=0.5, random_state=2021, max_iter=1000)
    ridge.fit(X_train, Y_train)

    # NOTE(review): assumes test_y.tsv rows are in the same order as
    # test.tsv rows — confirm, since labels are matched by position.
    true_y = test_y['sentiment'].values

    pred_test = ridge.predict_proba(X_test)

    # Column 1 of predict_proba is the probability of the second class in
    # ridge.classes_.
    auc = roc_auc_score(true_y, pred_test[:, 1])
    end = time.time()
    print('auc:', auc)
    print('running time:', end - start)

split: 1
auc: 0.9588617088651888
running time: 73.72855758666992
split: 2
auc: 0.9586232726109628
running time: 73.30441880226135
split: 3
auc: 0.9579323132304199
running time: 73.77296924591064
split: 4
auc: 0.958935231318548
running time: 74.2836103439331
split: 5
auc: 0.9579574162223122
running time: 74.13735008239746


In [6]:
# Result accumulators filled in by the evaluation loop: one AUC and one
# run time (in seconds) per split. Two separate lists, not one shared list.
all_auc, all_time = [], []

In [7]:
# Final run over the five splits: fit a logistic regression on each split's
# training reviews, record the test AUC and elapsed time, and write the
# predicted probabilities to `mysubmission.txt` inside the split directory.

# Start from empty result lists so that re-running this cell replaces the
# previous results instead of appending a second copy to them.
all_auc = []
all_time = []

# The vocabulary is fixed, so the feature names are the same for every split:
# look them up once instead of twice per iteration. `get_feature_names` was
# removed in scikit-learn 1.2, so prefer its replacement when it exists.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

for i in range(1, 6):
    start = time.time()
    print('split: ' + str(i))
    dirc = split_dirc.format(i)

    train_file = dirc + 'train.tsv'
    test_file = dirc + 'test.tsv'
    test_y_file = dirc + 'test_y.tsv'

    # Load the reviews and strip HTML-style tags before vectorizing.
    train = pd.read_csv(train_file, sep='\t', encoding='utf-8')
    train['review'] = train['review'].map(lambda s: cleaner.sub('', s))

    test = pd.read_csv(test_file, sep='\t', encoding='utf-8')
    test['review'] = test['review'].map(lambda s: cleaner.sub('', s))
    pred_id = test['id']

    test_y = pd.read_csv(test_y_file, sep='\t', encoding='utf-8')

    # Term counts over the fixed vocabulary, as dense frames labelled with the
    # feature names so train and test columns are matched by name.
    X_train = vectorizer.transform(train['review'].values).toarray()
    X_train = pd.DataFrame(X_train, columns=feature_names)
    Y_train = train['sentiment']

    X_test = vectorizer.transform(test['review'].values).toarray()
    X_test = pd.DataFrame(X_test, columns=feature_names)

    # No penalty is specified, so scikit-learn's default (L2) applies.
    ridge = LogisticRegression(C=0.5, random_state=2021, max_iter=1000)
    ridge.fit(X_train, Y_train)

    # NOTE(review): assumes test_y.tsv rows are in the same order as
    # test.tsv rows — confirm, since labels are matched by position.
    true_y = test_y['sentiment'].values

    pred_test = ridge.predict_proba(X_test)

    # Column 1 of predict_proba is the probability of the second class in
    # ridge.classes_.
    auc = roc_auc_score(true_y, pred_test[:, 1])
    end = time.time()
    # Convert to a built-in float so the collected list holds plain numbers
    # rather than NumPy scalars.
    t = float(np.round(end - start, 2))
    all_auc.append(auc)
    all_time.append(t)
    print('auc:', auc)
    print('running time:', t)

    # One tab-separated submission file per split: review id and the
    # predicted probability taken from column 1 of predict_proba.
    sub = dirc + 'mysubmission.txt'
    mysubmission = pd.DataFrame({'id': pred_id, 'prob': pred_test[:, 1]})
    mysubmission.to_csv(sub, sep='\t', index=False)
    

split: 1
auc: 0.9588617088651888
running time: 73.98
split: 2
auc: 0.9586232726109628
running time: 73.1
split: 3
auc: 0.9579323132304199
running time: 73.53
split: 4
auc: 0.958935231318548
running time: 73.74
split: 5
auc: 0.9579574162223122
running time: 74.02


In [8]:
# Display the AUC values collected by the evaluation loop, one per split.
all_auc

[0.9588617088651888,
 0.9586232726109628,
 0.9579323132304199,
 0.958935231318548,
 0.9579574162223122]

In [9]:
# Display the run times (seconds, rounded to 2 decimals) collected per split.
all_time

[73.98, 73.1, 73.53, 73.74, 74.02]

In [None]:
# Flatten the fitted coefficients to a 1-D array for display.
# `ridge` is the variable left behind by the split loop, i.e. the model
# fitted in whichever loop iteration ran last.
ridge.coef_.reshape(-1)

In [None]:
# Display the feature names in column order.
# `get_feature_names` was removed in scikit-learn 1.2, so prefer its
# replacement when present and fall back to the old name on older releases.
(
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, 'get_feature_names_out')
    else vectorizer.get_feature_names()
)

In [15]:
# Pair every feature name with its fitted coefficient.
# `ridge` is the model left behind by the split loop (its last iteration).
# `get_feature_names` was removed in scikit-learn 1.2, so prefer its
# replacement when present and fall back to the old name on older releases.
coef_df = pd.DataFrame({
    'feature': (
        vectorizer.get_feature_names_out()
        if hasattr(vectorizer, 'get_feature_names_out')
        else vectorizer.get_feature_names()
    ),
    'coef': ridge.coef_.reshape(-1),
})

In [18]:
# Show the coefficient table ordered from the most negative coefficient to
# the most positive one; `coef_df` itself is left unsorted.
coef_df.sort_values(by='coef')

Unnamed: 0,feature,coef
686,not recommend,-1.793241
501,not worth,-1.768052
643,mst3k,-1.641706
601,uninspired,-1.519235
472,redeeming,-1.504856
...,...,...
151,funniest,1.294789
64,underrated,1.327980
28,well worth,1.518508
83,definitely worth,1.640683


In [19]:
# Display the split directory template.
split_dirc

'D:\\studying\\UIUC courses\\STAT542\\project\\Project3\\splits\\split_{}\\'

In [21]:
# Save the feature/coefficient table as CSV, without the row index column.
coef_df.to_csv(
    'D:\\studying\\UIUC courses\\STAT542\\project\\Project3\\splits\\coef.csv',
    index=False,
)