In [1]:
from sklearn.preprocessing import StandardScaler

In [2]:
import pandas as pd 
import numpy as np 
from sklearn.feature_extraction.text import CountVectorizer
import xgboost as xgb 
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import roc_auc_score, accuracy_score
import re 
import time

In [3]:
def get_vectorizer(vocab_file, ngram_range=(1, 10)):
    """Build a CountVectorizer restricted to the terms listed in a file.

    The file is read as UTF-8 with one term per line; surrounding whitespace
    is stripped from every line. Blank lines and repeated terms are skipped
    and each remaining term receives the next free column index, so the
    term -> index mapping is always contiguous (0 .. n-1). A dict vocabulary
    with gaps in its indices is rejected by CountVectorizer when it is first
    used, which is what a duplicated line used to cause.

    Parameters
    ----------
    vocab_file : str or path-like
        Path of the vocabulary file.
    ngram_range : tuple of (int, int), default=(1, 10)
        Forwarded unchanged to ``CountVectorizer(ngram_range=...)``.

    Returns
    -------
    CountVectorizer
        A vectorizer constructed with the fixed vocabulary; columns follow
        the order in which terms first appear in the file.

    Raises
    ------
    ValueError
        If the file contains no usable term.
    """
    voca = {}
    with open(vocab_file, encoding='UTF-8') as f:
        for line in f:
            term = line.strip()
            # Keep the first occurrence only; len(voca) is the next free index.
            if term and term not in voca:
                voca[term] = len(voca)

    if not voca:
        raise ValueError('no vocabulary terms found in {!r}'.format(vocab_file))

    return CountVectorizer(vocabulary=voca, ngram_range=ngram_range)


In [4]:
# Non-greedy pattern matching one angle-bracketed span (e.g. an HTML tag);
# the cells below use it to delete such spans from the review text.
cleaner = re.compile('<.*?>')
# Directory template for one data split; fill the placeholder with the split
# number via split_dirc.format(i). The trailing separator lets file names be
# appended with plain string concatenation.
split_dirc = 'D:\\studying\\UIUC courses\\STAT542\\project\\Project3\\splits\\split_{}\\'

In [5]:
# Vocabulary file (one term per line) that defines the feature columns.
vocab_file = r'D:\studying\UIUC courses\STAT542\project\Project3\splits\vocab_lasso_10_999.txt'
# Alternative vocabulary file, kept for reference (presumably a smaller term list, judging by its name):
# vocab_file = 'D:\\studying\\UIUC courses\\STAT542\\project\\Project3\\splits\\vocab_lasso_840.txt'
# One shared vectorizer with this fixed vocabulary is reused by every cell below.
vectorizer = get_vectorizer(vocab_file)

In [6]:
# Count the feature columns the vectorizer will produce.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older installs.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()
len(feature_names)

999

# Tuning

In [10]:
# Split number used by the single-split tuning cells that follow.
i = 1

In [11]:
# Resolve the three TSV files that make up split `i`.
dirc = split_dirc.format(i)
train_file, test_file, test_y_file = (
    dirc + name for name in ('train.tsv', 'test.tsv', 'test_y.tsv')
)

# Read the raw tab-separated tables.
train = pd.read_csv(train_file, sep='\t', encoding='utf-8')
test = pd.read_csv(test_file, sep='\t', encoding='utf-8')
test_y = pd.read_csv(test_y_file, sep='\t', encoding='utf-8')

# Delete every span matched by `cleaner` from the review text.
train['review'] = train['review'].map(lambda text: cleaner.sub('', text))
test['review'] = test['review'].map(lambda text: cleaner.sub('', text))

# Dense term-count matrices over the fixed vocabulary, plus the training labels.
Y_train = train['sentiment']
X_train = vectorizer.transform(train['review'].values).toarray()
X_test = vectorizer.transform(test['review'].values).toarray()

In [14]:
# Resolve the column names once. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2, so prefer get_feature_names_out()
# and fall back only on older installs.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

# Standardise every count column. The scaler is fitted on the training matrix
# only and then applied unchanged to the test matrix.
# NOTE: X_train / X_test are overwritten here, so this cell expects the raw
# count matrices produced by the loading cell.
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=feature_names)
X_test = pd.DataFrame(scaler.transform(X_test), columns=feature_names)

# Ground-truth labels of the test split, used for AUC below.
true_y = test_y['sentiment'].values

In [15]:
# Sweep ten evenly spaced values of C between 0.0001 and 0.001 (rounded to
# four decimals) and report the AUC each fitted model reaches.
# NOTE(review): the AUC is computed on X_test / true_y, i.e. C is being
# compared on the test split itself; confirm this is intended.
for c in np.round(np.linspace(0.0001, 0.001, 10), 4):
    print(c, end=' ')  # show C first so progress is visible while fitting
    ridge = LogisticRegression(
        penalty='l2',
        solver='liblinear',
        C=c,
        max_iter=1000,
        random_state=2021,
    )
    ridge.fit(X_train, Y_train)

    # Score with the second probability column returned by predict_proba.
    pred_test = ridge.predict_proba(X_test)
    auc = roc_auc_score(true_y, pred_test[:, 1])
    print('auc:', auc)

0.0001 auc: 0.9598295085805553
0.0002 auc: 0.9611261226300377
0.0003 auc: 0.9615697278997224
0.0004 auc: 0.9617467655257093
0.0005 auc: 0.9618114634275639
0.0006 auc: 0.9618199242670388
0.0007 auc: 0.9618015945815197
0.0008 auc: 0.9617628744008669
0.0009 auc: 0.9617145925756031
0.001 auc: 0.9616640323397093


In [None]:
# Single fit at C=0.5 with the library-default penalty and solver, scored by
# AUC on the currently loaded test split.
true_y = test_y['sentiment'].values

ridge = LogisticRegression(C=0.5, max_iter=1000, random_state=2021)
ridge.fit(X_train, Y_train)

# Score with the second probability column returned by predict_proba.
pred_test = ridge.predict_proba(X_test)
auc = roc_auc_score(true_y, pred_test[:, 1])
print('auc:', auc)

In [9]:
# Evaluate C=0.5 on all five splits, reporting AUC and wall-clock time each.

# Column names do not change between splits, so resolve them once.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installs.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

# NOTE: the loop variable reuses the notebook-level name `i`.
for i in range(1, 6):
    start = time.time()
    print('split: ' + str(i))
    dirc = split_dirc.format(i)

    train_file = dirc + 'train.tsv'
    test_file = dirc + 'test.tsv'
    test_y_file = dirc + 'test_y.tsv'

    # Load the split and delete every span matched by `cleaner` from the reviews.
    train = pd.read_csv(train_file, sep='\t', encoding='utf-8')
    train['review'] = train['review'].map(lambda s: re.sub(cleaner, '', s))

    test = pd.read_csv(test_file, sep='\t', encoding='utf-8')
    test['review'] = test['review'].map(lambda s: re.sub(cleaner, '', s))

    test_y = pd.read_csv(test_y_file, sep='\t', encoding='utf-8')

    # Dense term-count matrices over the fixed vocabulary.
    X_train = vectorizer.transform(train['review'].values).toarray()
    Y_train = train['sentiment']
    X_test = vectorizer.transform(test['review'].values).toarray()

    # Fit the scaler on the training matrix only, then apply it to both.
    scaler = StandardScaler()
    X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=feature_names)
    X_test = pd.DataFrame(scaler.transform(X_test), columns=feature_names)

    ridge = LogisticRegression(C=0.5, random_state=2021, max_iter=1000)
    ridge.fit(X_train, Y_train)

    true_y = test_y['sentiment'].values

    # Score with the second probability column returned by predict_proba.
    pred_test = ridge.predict_proba(X_test)

    auc = roc_auc_score(true_y, pred_test[:, 1])
    end = time.time()  # timing covers loading, vectorising, fitting and scoring
    print('auc:', auc)
    print('running time:', end - start)

split: 1
auc: 0.9588617088651888
running time: 73.72855758666992
split: 2
auc: 0.9586232726109628
running time: 73.30441880226135
split: 3
auc: 0.9579323132304199
running time: 73.77296924591064
split: 4
auc: 0.958935231318548
running time: 74.2836103439331
split: 5
auc: 0.9579574162223122
running time: 74.13735008239746


# 5 splits

In [7]:
# Per-split results, appended to by the 5-split loop: AUC values and
# rounded running times in seconds.
all_auc, all_time = [], []

In [8]:
# Final run: for each of the five splits, fit the model, record AUC and
# running time, and write the predicted probabilities to mysubmission.txt.

# Column names do not change between splits, so resolve them once.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installs.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

# NOTE: the loop variable reuses the notebook-level name `i`; all_auc and
# all_time are appended to, so reset them before re-running this cell.
for i in range(1, 6):
    start = time.time()
    print('split: ' + str(i))
    dirc = split_dirc.format(i)

    train_file = dirc + 'train.tsv'
    test_file = dirc + 'test.tsv'
    test_y_file = dirc + 'test_y.tsv'

    # Load the split and delete every span matched by `cleaner` from the reviews.
    train = pd.read_csv(train_file, sep='\t', encoding='utf-8')
    train['review'] = train['review'].map(lambda s: re.sub(cleaner, '', s))

    test = pd.read_csv(test_file, sep='\t', encoding='utf-8')
    test['review'] = test['review'].map(lambda s: re.sub(cleaner, '', s))
    pred_id = test['id']  # ids written alongside the predictions

    test_y = pd.read_csv(test_y_file, sep='\t', encoding='utf-8')

    # Dense term-count matrices over the fixed vocabulary.
    X_train = vectorizer.transform(train['review'].values).toarray()
    Y_train = train['sentiment']
    X_test = vectorizer.transform(test['review'].values).toarray()

    # Fit the scaler on the training matrix only, then apply it to both.
    scaler = StandardScaler()
    X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=feature_names)
    X_test = pd.DataFrame(scaler.transform(X_test), columns=feature_names)

    # NOTE(review): the tuning sweep above used solver='liblinear' and its
    # printed AUC peaked at C=0.0006; this fit uses C=0.0007 with the default
    # solver. Confirm that difference is intended.
    ridge = LogisticRegression(C=0.0007, random_state=2021, max_iter=1000)
    ridge.fit(X_train, Y_train)

    true_y = test_y['sentiment'].values

    # Score with the second probability column returned by predict_proba.
    pred_test = ridge.predict_proba(X_test)

    auc = roc_auc_score(true_y, pred_test[:, 1])
    end = time.time()  # timing covers loading, vectorising, fitting and scoring
    t = np.round(end - start, 2)
    all_auc.append(auc)
    all_time.append(t)
    print('auc:', auc)
    print('running time:', t)

    # Write the tab-separated submission (id, prob) into the split's folder.
    sub = dirc + 'mysubmission.txt'
    mysubmission = pd.DataFrame({'id': pred_id, 'prob': pred_test[:, 1]})
    mysubmission.to_csv(sub, sep='\t', index=False)
    

split: 1
auc: 0.9618845965687738
running time: 69.95
split: 2
auc: 0.9606171850221539
running time: 70.08
split: 3
auc: 0.9607445590221857
running time: 70.02
split: 4
auc: 0.9608431941396443
running time: 70.06
split: 5
auc: 0.9601191260251195
running time: 70.06


In [9]:
# Display the AUC values collected by the 5-split loop, in split order.
all_auc

[0.9618845965687738,
 0.9606171850221539,
 0.9607445590221857,
 0.9608431941396443,
 0.9601191260251195]

In [10]:
# Display the rounded running times (seconds) collected by the 5-split loop.
all_time

[69.95, 70.08, 70.02, 70.06, 70.06]

In [None]:
# Flatten the coefficient array of the most recently fitted `ridge` model to 1-D for inspection.
ridge.coef_.reshape(-1)

In [None]:
# List the vocabulary terms in feature-column order.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older installs.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()
feature_names

In [12]:
# Pair every vocabulary term with its fitted coefficient.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older installs.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

# NOTE: `ridge` is whichever model was fitted most recently in the kernel
# (the last split's model if the 5-split loop was the last fit to run).
coef_df = pd.DataFrame({'feature': feature_names, 'coef': ridge.coef_.reshape(-1)})

In [13]:
# Display the terms ordered by coefficient, most negative first; the sorted
# frame is only shown, coef_df is not reassigned.
coef_df.sort_values(by=['coef'])

Unnamed: 0,feature,coef
422,worst,-0.197746
424,awful,-0.174627
421,bad,-0.148358
423,waste,-0.142396
432,poor,-0.134137
...,...,...
2,wonderful,0.107991
5,perfect,0.119839
1,excellent,0.151538
3,best,0.160606


In [14]:
# Display the split-directory template (presumably to check the base path before saving).
split_dirc

'D:\\studying\\UIUC courses\\STAT542\\project\\Project3\\splits\\split_{}\\'

In [15]:
# Save the term/coefficient table as coef.csv in the same `splits` folder that
# holds the split directories; index=False leaves the row index out of the file.
coef_df.to_csv('D:\\studying\\UIUC courses\\STAT542\\project\\Project3\\splits\\coef.csv', index=False)