In [None]:
import pandas as pd 
import numpy as np 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
import re 

In [None]:
def vars(a, axis=None):
    """Population variance of the sparse matrix ``a``.

    Computed with the identity ``var = mean(a**2) - mean(a)**2``. Only the
    explicitly stored entries are squared, so the matrix is never densified.

    Parameters
    ----------
    a : sparse matrix
        Must provide ``copy()``, a ``data`` array of stored values and
        ``mean(axis)``. ``a`` itself is left unmodified.
    axis : optional
        Forwarded unchanged to ``mean``; ``None`` (the default) averages
        over every entry of the matrix.

    Returns
    -------
    The difference of the two ``mean(axis)`` results, in whatever type
    ``mean`` produces for the given ``axis``.

    Note: this name shadows Python's builtin ``vars`` in the notebook.
    """
    # Work on a copy so squaring the stored values does not touch the input.
    squared = a.copy()
    np.square(squared.data, out=squared.data)

    mean_of_squares = squared.mean(axis)
    square_of_mean = np.square(a.mean(axis))
    return mean_of_squares - square_of_mean

# Load the whole data

I build my vocabulary from the full dataset, `alldata.tsv`. HTML tags are stripped from every review first.

In [None]:
# Read the tab-separated review file into a DataFrame.
alldata = pd.read_csv('./alldata.tsv', sep='\t', encoding='utf-8')

# Non-greedy pattern: each `<...>` tag is matched on its own rather than
# everything between the first '<' and the last '>' on a line.
cleaner = re.compile('<.*?>')

# Replace every tag with the empty string, so the text on either side of a
# tag ends up directly adjacent.
alldata['review'] = alldata['review'].map(lambda text: cleaner.sub('', text))

# Construct DT matrix

I use `sklearn.feature_extraction.text.CountVectorizer()` to get the matrix of token counts.

In [None]:
# Custom stop-word list (pronouns, articles and a few auxiliary verbs) that
# is passed to the vectorizer below.
stop_words = [
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves",
    "you", "your", "yours", "their", "they", "his", "her", "she",
    "he", "a", "an", "and", "is", "was", "are", "were",
    "him", "himself", "has", "have", "it", "its", "the", "us",
]

# Raw review texts, one entry per document.
corpus = alldata['review'].values

# Count 1- to 8-grams. In scikit-learn a float `min_df` / `max_df` is a
# proportion of documents, so terms found in fewer than 0.1% or in more than
# 50% of the reviews are dropped.
vectorizer = CountVectorizer(
    stop_words=stop_words,
    ngram_range=(1, 8),
    min_df=0.001,
    max_df=0.5,
)

# Document-term count matrix: one row per review, one column per kept term.
X = vectorizer.fit_transform(corpus)

# Pick top 2500 words

Following Prof. Liang's suggestion, I compute a two-sample t-statistic for every term and keep the 2500 terms with the largest absolute t-statistics.

In [None]:
# Boolean mask of the reviews labelled sentiment == 1 (group 1); its
# complement forms group 2.
indi = (alldata['sentiment'] == 1)
size = X.shape[1]

# Per-term summary statistics, one row per column of X.
# Columns: [mean_1, var_1, mean_2, var_2].
summ = np.zeros((size, 4))
for first_col, group_rows in ((0, X[indi]), (2, X[~indi])):
    # Slice each group once and reuse it for both the mean and the variance.
    summ[:, first_col] = np.asarray(group_rows.mean(axis=0)).reshape(-1)
    summ[:, first_col + 1] = np.asarray(vars(group_rows, axis=0)).reshape(-1)

# Group sizes. Count the rows actually selected by the mask instead of
# summing the raw labels, so n1 always matches the rows used for group 1.
n1 = int(indi.sum())
n2 = len(alldata) - n1

# Two-sample t-statistic with unpooled variances, one value per term.
tstat = (summ[:,0] - summ[:,2]) / np.sqrt(summ[:,1] / n1 + summ[:,3] / n2)

# Indices of the 2500 terms with the largest |t|, split by the sign of t.
idx = np.argsort(-np.abs(tstat))[:2500]
pos_idx = idx[tstat[idx] >= 0]
neg_idx = idx[tstat[idx] < 0]

# `get_feature_names()` was removed in scikit-learn 1.2; prefer the
# replacement and fall back only on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
words = np.array(feature_names, dtype='str')

pos_words = words[pos_idx]
neg_words = words[neg_idx]
# Positive-t terms first, then negative-t terms, each ordered by |t|.
vocab_2500 = pos_words.tolist() + neg_words.tolist()

# Reduce the vocab size to 1000

I fit an L1-penalized logistic regression (inverse regularization strength `C=0.0955`) and keep the terms with nonzero coefficients, which reduces the vocabulary to 999 terms.  
Then I save my vocabulary to `myvocab.txt`.

In [None]:
# Map each candidate term to its column index, in `vocab_2500` order.
voca = {term: position for position, term in enumerate(vocab_2500)}

# NOTE(review): this vectorizer is built without the `stop_words` argument
# that the vocabulary-building vectorizer above uses, so n-grams that were
# formed across a removed stop word may never be matched here. Confirm this
# is intended before changing it, since it affects which terms are kept.
vectorizer = CountVectorizer(vocabulary=voca, ngram_range=(1, 10))

# `get_feature_names()` was removed in scikit-learn 1.2; prefer the
# replacement and fall back only on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = list(vectorizer.get_feature_names())

# Dense count matrix restricted to the candidate terms, wrapped in a
# DataFrame so every column carries its term as a name.
X = vectorizer.transform(alldata['review'].values).toarray()
X_train = pd.DataFrame(X, columns=feature_names)
Y_train = alldata['sentiment']

# The L1 penalty drives many coefficients to exactly zero; the terms whose
# coefficient stays nonzero make up the final vocabulary.
lasso = LogisticRegression(penalty='l1', C=0.0955, solver='liblinear', random_state=2021)
lasso.fit(X_train, Y_train)
coef = lasso.coef_.reshape(-1)
features = np.array(feature_names)

# Write the selected terms, one per line.
with open('./myvocab.txt', mode='w', encoding='UTF-8') as f:
    f.writelines(w + '\n' for w in features[coef != 0].tolist())