In [1]:
import numpy as np
import pandas as pd
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
# from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV
# from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
import warnings
# Install a global "ignore" filter so no Python warnings are shown by any later cell.
# NOTE(review): this hides every warning category, including ones raised by the
# libraries imported above — confirm that blanket suppression is intended.
warnings.filterwarnings("ignore")

In [2]:
# Load the fixed vocabulary: one entry per line of myvocab_R.txt, with the
# underscores inside each entry turned into spaces.
with open("myvocab_R.txt") as f:
    # Strip only the newline terminator rather than blindly slicing off the
    # last character, so a final line that has no trailing newline keeps its
    # last letter.
    vocab = [line.rstrip("\n").replace("_", " ") for line in f]

In [3]:
# Index of the split directory to read (files live under split_<i>/).
i = 1
folderName = f'split_{i}'
train_filename = f'{folderName}/train.tsv'
test_filename = f'{folderName}/test.tsv'
test_y_filename = f'{folderName}/test_y.tsv'

# Training file: tab-separated with a header row. The 'sentiment' column is
# split off as the target; every other column stays in train_features.
train_data = pd.read_csv(train_filename, sep='\t', header=0)
train_y = train_data['sentiment']
train_features = train_data.drop(columns=['sentiment'])

# Test file: only the 'review' column is used as model input.
test_data = pd.read_csv(test_filename, sep='\t', header=0)
test_features = test_data['review']

# The test labels are stored in a separate file.
test_y_data = pd.read_csv(test_y_filename, sep='\t', header=0)

In [4]:
def clean_text(text):
    """Return ``text`` with leading/trailing whitespace removed, in lower case."""
    trimmed = text.strip()
    return trimmed.lower()

In [5]:
# Count 1- to 4-gram occurrences, restricted to the preloaded vocabulary.
vectorizer = CountVectorizer(vocabulary=vocab, ngram_range=(1, 4))
# L1-penalised logistic regression fitted with the liblinear solver.
classifier = LogisticRegression(C=0.14, penalty='l1', solver='liblinear')
LRmodel = Pipeline(steps=[
    ('vectorizer', vectorizer),
    ('classifier', classifier),
])

# Fit on the training reviews; the test labels come from the separate label file.
test_Y = test_y_data['sentiment']
LRmodel.fit(train_features['review'], train_y)

# Hard predictions feed the accuracy; the second predict_proba column
# (probability of classes_[1]) feeds the AUC.
result = LRmodel.predict(test_features)
LRpred = LRmodel.predict_proba(test_features)[:, 1]
print(f'Accuracy: {accuracy_score(test_Y,result)*100}%')
auc = roc_auc_score(test_Y, LRpred, average='micro')
print(f'AUC: {auc*100}%')

# Keep only the vocabulary terms whose fitted coefficient is non-zero.
test = LRmodel['vectorizer'].get_feature_names_out()
new_vocab = test[LRmodel['classifier'].coef_[0] != 0]

Accuracy: 89.19200000000001%
AUC: 95.61777027426898%


In [6]:
# Display the first five entries of new_vocab (the terms with non-zero coefficients).
new_vocab[:5]

array(['low rating', 'rodriguez', 'spinal tap', 'of justice', 'burke'],
      dtype=object)

In [10]:
# Convert each selected term back to underscore-joined form.
res = [term.replace(' ', '_') for term in new_vocab]
# Echo every converted term, one per line.
for words in res:
    print('words:', words)
# Hand the result on as a NumPy array.
res = np.array(res)

words: low_rating
words: rodriguez
words: spinal_tap
words: of_justice
words: burke
words: choir
words: elmer
words: penalty
words: segal
words: jimmy_stewart
words: novak
words: lucille
words: spade
words: of_london
words: bela_lugosi
words: hepburn
words: few_laughs
words: sources
words: devito
words: gorilla
words: rainy
words: for_change
words: ladder
words: been_fan_of
words: cycle
words: gandhi
words: mystery_science
words: tunnel
words: much_too
words: only_problem
words: two_stars
words: south_park
words: cope
words: swim
words: rod
words: yawn
words: jackass
words: olds
words: polished
words: be_missed
words: having_sex
words: bravo
words: kitty
words: berlin
words: last_scene
words: not_enough_to
words: few_good
words: tiresome
words: for_once
words: smiling
words: garfield
words: not_very_good
words: mason
words: can_never
words: ape
words: baldwin
words: based_on_true_story
words: scary_movie
words: stilted
words: stock_footage
words: very_disappointed
words: boll
words: ba

In [11]:
# Write the entries of res to final_vocab.txt as plain strings ('%s' format).
np.savetxt("final_vocab.txt",res.astype(str),fmt='%s') 