In [82]:
import pandas as pd

In [83]:
train = pd.read_csv('data/split_1/train.tsv', sep='\t')
test = pd.read_csv('data/split_1/test.tsv', sep='\t')
test_y = pd.read_csv('data/split_1/test_y.tsv', sep='\t')

train_label = train['sentiment']
train_review =  train['review'].str.replace('&lt;.*?&gt;', ' ', regex=True)

test_label = test_y['sentiment']
test_review = test['review'].str.replace('&lt;.*?&gt;', ' ', regex=True)

In [84]:
train.head()

Unnamed: 0,id,sentiment,review
0,1,1,Naturally in a film who's main themes are of m...
1,4,0,Afraid of the Dark left me with the impression...
2,7,0,This has to be one of the biggest misfires eve...
3,8,0,"This is one of those movies I watched, and won..."
4,17,0,This movie was dreadful. Biblically very inacc...


In [85]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    stop_words='english',
    lowercase=True,  # Converts all text to lowercase by default
    ngram_range=(1, 4),  # Extracts unigrams only by default
	preprocessor=lambda x: x.lower(),  # Convert to lowercase
    min_df=0.001,                        # Minimum term frequency
    max_df=0.5,                       # Maximum document frequency
    token_pattern=r"\b[\w+\|']+\b" # Use word tokenizer: See Ethan's comment below
)
dtm_train = vectorizer.fit_transform(train['review'])

In [86]:
dtm_train.shape

(25000, 16224)

In [87]:
dtm_test = vectorizer.transform(test['review'])

In [88]:
# use lassocv to extract features
from sklearn.linear_model import LogisticRegressionCV
from sklearn.feature_selection import SelectFromModel

model = LogisticRegressionCV(max_iter=10000, cv=5, n_jobs=-1, penalty='l1', solver='liblinear')
model.fit(dtm_train, train_label)

In [89]:
coefficients = model.coef_.flatten()

feature_names = vectorizer.get_feature_names_out()
feature_importance = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})

# Calculate the absolute values of coefficients for ranking
feature_importance['Absolute Coefficient'] = feature_importance['Coefficient'].abs()

# Sort the features by absolute coefficient values
sorted_features = feature_importance.sort_values(by='Absolute Coefficient', ascending=False)
sorted_features.head(20)

Unnamed: 0,Feature,Coefficient,Absolute Coefficient
248,7 10,33.500606,33.500606
186,3 10,-28.176004,28.176004
212,4 10,-23.253202,23.253202
15630,waste,-22.838807,22.838807
16011,worst,-22.151248,22.151248
260,8 10,20.485692,20.485692
1106,awful,-18.402624,18.402624
7,1 10,-18.045434,18.045434
143,2 10,-17.823135,17.823135
11349,poorly,-15.159419,15.159419


In [94]:
# save top 1000 features to myvocab_1000.txt
# save top 2000 features to myvocab_2000.txt
# save top 3000 features to myvocab_3000.txt
sorted_features['Feature'].head(1000).to_csv('myvocab_1000.txt', index=False)
sorted_features['Feature'].head(2000).to_csv('myvocab_2000.txt', index=False)
sorted_features['Feature'].head(3000).to_csv('myvocab_3000.txt', index=False)


In [91]:
# from xgboost import XGBClassifier
# from sklearn.metrics import roc_auc_score
# params = {
# 	"learning_rate": 0.05,
# 	"n_estimators": 10000,
# 	"max_depth": 6,
# 	"subsample": 0.8,
# 	"device": "cuda",
# 	"tree_method": "hist",
# 	"use_label_encoder": False
# }
# model = XGBClassifier(**params)
# model.fit(dtm_train, train_label)

In [92]:
# val_probs = model.predict_proba(dtm_test)[:, 1]
# roc_auc_score(test_label, val_probs)

In [93]:
#