In [207]:
import pandas as pd
import numpy as np
from scipy import stats
from joblib import Parallel, delayed

In [208]:
data = pd.DataFrame()
for i in range(1, 6):
    data = pd.concat([data, pd.read_csv(f'data/split_{i}/train.tsv', sep='\t')])
    test_review = pd.read_csv(f'data/split_{i}/test.tsv', sep='\t')
    test_review['sentiment'] = pd.read_csv(f'data/split_{i}/test_y.tsv', sep='\t')['sentiment']
    data = pd.concat([data, test_review])

In [209]:
data_label = data['sentiment']
data_review =  data['review'].str.replace('&lt;.*?&gt;', ' ', regex=True)

In [210]:
# download nltk stopwords
import nltk
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')


[nltk_data] Downloading package stopwords to C:\Users\Yangliang
[nltk_data]     Lu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [211]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

def vectorizer(min_df = None, max_df = None, ngram_range = (1, 2), vector_type = 'tfidf'):
    vector = None
    if vector_type == 'tfidf':
        vector = TfidfVectorizer()
    elif vector_type == 'count':
        vector = CountVectorizer()
    if min_df is not None:
        vector.set_params(min_df=min_df)
    if max_df is not None:
        vector.set_params(max_df=max_df)

    vector.set_params(stop_words=stopwords, ngram_range=ngram_range, preprocessor=lambda x: x.lower(), token_pattern=r"\b[\w+\|']+\b")


    return vector

In [212]:
vector_1 = vectorizer(min_df=0.0001, max_df=0.5, ngram_range=(1, 2))
labels = data['sentiment'].to_numpy()
dtm = vector_1.fit_transform(data['review'])
dtm_positive = dtm[labels == 1]
dtm_negative = dtm[labels == 0]

In [213]:
print(dtm_positive.shape)
print(dtm_negative.shape)
print(dtm.shape)

(125000, 182618)
(125000, 182618)
(250000, 182618)


In [214]:
def compute_t_test(feature_index, dtm_positive, dtm_negative):
    positive_feature_values = dtm_positive[:, feature_index].toarray().flatten()
    negative_feature_values = dtm_negative[:, feature_index].toarray().flatten()

    t_stat, p_value = stats.ttest_ind(positive_feature_values, negative_feature_values, equal_var=True)
    return t_stat, p_value

# Run the t-tests in parallel.
t_test_results = Parallel(n_jobs=-1)(delayed(compute_t_test)(i, dtm_positive, dtm_negative) for i in range(dtm.shape[1]))

# Extract t-statistics and p-values from the results.
t_stats, p_values = zip(*t_test_results)

In [215]:
# dense_positive = dtm_positive.toarray()
# dense_negative = dtm_negative.toarray()
#
# # Perform the t-tests across the feature axis.
# t_stats, p_values = stats.ttest_ind(dense_positive, dense_negative, axis=0, equal_var=False)


In [216]:
t_test_results = pd.DataFrame({
    'Feature': vector_1.get_feature_names_out(),
    't-statistic': t_stats,
    'p-value': p_values,
    'abs t-statistic': np.absolute(t_stats)
})

In [217]:
significant_features = t_test_results[t_test_results['p-value'] < 0.05]
significant_features = significant_features.sort_values(by='abs t-statistic', ascending=False)
significant_features.shape

(110679, 4)

In [218]:
neative_features = significant_features[significant_features['t-statistic'] < 0]
positive_features = significant_features[significant_features['t-statistic'] > 0]

In [219]:
print(neative_features.shape)
print(positive_features.shape)

(54743, 4)
(55936, 4)


In [220]:
# find features that are in both positive and negative features
common_features = pd.merge(neative_features, positive_features, on='Feature')
common_features.head()

Unnamed: 0,Feature,t-statistic_x,p-value_x,abs t-statistic_x,t-statistic_y,p-value_y,abs t-statistic_y


In [221]:
vector_2 = vectorizer(min_df=0.0001, max_df=0.5, ngram_range=(1, 2))
vector_2.fit(significant_features['Feature'].head(2000))
significant_dmt_train = vector_2.transform(data_review)
significant_dmt_train.shape

(250000, 2168)

In [222]:
# use lassocv to extract features
from sklearn.linear_model import LogisticRegressionCV
from sklearn.feature_selection import SelectFromModel

model = LogisticRegressionCV(max_iter=10000, cv=5, n_jobs=-1, penalty='l1', solver='liblinear')
model.fit(significant_dmt_train, data_label)

In [223]:
coefficients = model.coef_.flatten()
feature_names = vector_2.get_feature_names_out()
feature_importance = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})
# Calculate the absolute values of coefficients for ranking
feature_importance['Absolute Coefficient'] = feature_importance['Coefficient'].abs()
# Sort the features by absolute coefficient values
sorted_features = feature_importance.sort_values(by='Absolute Coefficient', ascending=False)

sorted_features.head()

Unnamed: 0,Feature,Coefficient,Absolute Coefficient
1827,supposed comedy,-163.229624,163.229624
2116,worse acting,-157.592112,157.592112
1428,please waste,-96.880814,96.880814
987,instead 1,-75.386249,75.386249
767,forwarding,-68.635547,68.635547


In [224]:
sorted_features['Feature'].shape

(2168,)

In [225]:
train = pd.read_csv('data/split_1/train.tsv', sep='\t')
test = pd.read_csv('data/split_1/test.tsv', sep='\t')
test_y = pd.read_csv('data/split_1/test_y.tsv', sep='\t')

train_label = train['sentiment']
train_review =  train['review'].str.replace('&lt;.*?&gt;', ' ', regex=True)

test_label = test_y['sentiment']
test_review = test['review'].str.replace('&lt;.*?&gt;', ' ', regex=True)

In [226]:
vector_3 = vectorizer(min_df=0.0001, max_df=0.5, ngram_range=(1, 2))
vector_3.fit(sorted_features['Feature'].head(1000))
dtm_train = vector_3.transform(train_review)
dtm_test = vector_3.transform(test_review)

In [227]:
model = LogisticRegressionCV(max_iter=10000, cv=5, n_jobs=-1, solver='liblinear')
model.fit(dtm_train, train_label)

In [228]:
from sklearn.metrics import roc_auc_score

pred = model.predict_proba(dtm_test)[:, 1]
roc_auc_score(test_label, pred)

0.9607600025218678

In [233]:
sorted_features['Feature'].shape

(2168,)

In [234]:
# save top 980 features
sorted_features['Feature'].head(1000).to_csv('myvocab.txt', sep='\t', index=False)

In [231]:
# from xgboost import XGBClassifier
# from sklearn.metrics import roc_auc_score
# params = {
# 	"learning_rate": 0.0001,
# 	"n_estimators": 10000,
# 	"max_depth": 7,
# 	"subsample": 0.8,
# 	"device": "cuda",
# 	"tree_method": "hist",
# 	"use_label_encoder": False
# }
# model = XGBClassifier(**params)
# model.fit(dtm_train, train_label)

In [232]:
# val_probs = model.predict_proba(dtm_test)[:, 1]
# roc_auc_score(test_label, val_probs)