In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import spacy

In [2]:
from sklearn.metrics import confusion_matrix, precision_score, balanced_accuracy_score, accuracy_score, f1_score, recall_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

In [3]:
# read in comments data
comments = pd.read_csv('../data/comments_clean.csv', index_col=0)
print(comments.shape)
comments.head()

(9652, 2)


Unnamed: 0,clean_body,target
0,use subtitles,1
1,"for neither ever, nor never",1
2,it reminds me of fringe too,1
3,"saw s01 in english, had to resort to subtitles...",1
4,"when season 2 was released, all the recaps i h...",1


In [4]:
# drop forgotten NAs
comments.isna().sum()

clean_body    17
target         0
dtype: int64

In [5]:
comments.dropna(inplace=True)
print(comments.shape)
comments.isna().sum()

(9635, 2)


clean_body    0
target        0
dtype: int64

In [6]:
# Set X and y
X = comments['clean_body']
y = comments['target']

In [7]:
# tts
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=23)

In [8]:
# baseline
# 0: fantasy
# 1: scifi 
y_train.value_counts(normalize=True)

target
0    0.500969
1    0.499031
Name: proportion, dtype: float64

In [18]:
# lemmatize function with spacy
# nlp = spacy.load("en_core_web_sm")

# def lemmatizer(sentence):
#     doc = nlp(sentence)
#     lemmatized_tokens = [token.lemma_ for token in doc]
#     return ' '.join(lemmatized_tokens)

# comments['clean_body'] = comments['clean_body'].apply(lemmatizer)

-----
## Multinomial Naive Bias with GridSearch 

#### Model

In [19]:
nb_pipe = make_pipeline(
    TfidfVectorizer(),
    MultinomialNB()
)

nb_params = {
    'tfidfvectorizer__max_features': [500, 1000, 2000, 5000, 7000],
    'tfidfvectorizer__ngram_range': [(1,1), (1,2), (2,2), (1,3), (2,3), (3,3)],
    'tfidfvectorizer__stop_words': ['english'],
    'tfidfvectorizer__analyzer': ['word'],
    # 'tfidfvectorizer__tokenizer': [lemmatizer]
}

In [20]:
# pass pipe and params to gridsearch
nb_gs = GridSearchCV(nb_pipe, nb_params, n_jobs=-1)
nb_gs.fit(X_train, y_train)
print('TRAIN', nb_gs.score(X_train, y_train))
print('TEST', nb_gs.score(X_test, y_test))
print('BEST PARAMS', nb_gs.best_params_)

TRAIN 0.8852753944090783
TEST 0.779991697799917
BEST PARAMS {'tfidfvectorizer__analyzer': 'word', 'tfidfvectorizer__max_features': 7000, 'tfidfvectorizer__ngram_range': (1, 1), 'tfidfvectorizer__stop_words': 'english'}


In [21]:
preds_nb = nb_gs.predict(X_test)
preds_nb

array([0, 0, 1, ..., 0, 0, 1])

#### Classification metrics

In [22]:
# metrics
print('RECALL:', recall_score(y_test, preds_nb))
print('PRECISION:', precision_score(y_test, preds_nb))
print('F1:', f1_score(y_test, preds_nb))

RECALL: 0.7204658901830283
PRECISION: 0.8169811320754717
F1: 0.7656940760389036


#### Coefficients

In [23]:
np.unique(y_test, return_counts=True)

(array([0, 1]), array([1207, 1202]))

In [24]:
# coef baseline
2402/(2402 + 2411)

0.4990650322044463

In [25]:
#look at coefs
coefs_nb = nb_gs.best_estimator_['multinomialnb'].feature_log_prob_
coefs_nb

array([[-8.62328579, -7.90923935, -8.14257847, ..., -9.87532598,
        -9.48351322, -9.27877253],
       [-8.45371989, -7.96662874, -7.54838496, ..., -9.47107963,
        -9.07271777, -8.49644555]])

----
## Logistic Regression

#### Model

In [27]:
lg_pipe = make_pipeline(
    TfidfVectorizer(),
    LogisticRegression()
)

lg_params = {
    'tfidfvectorizer__max_features': [500, 1000, 2000, 5000, 7000],
    'tfidfvectorizer__ngram_range': [(1,1), (1,2), (2,2), (1,3), (2,3), (3,3)],
    'tfidfvectorizer__stop_words': ['english'],
    'logisticregression__max_iter': [100, 1000],
    'logisticregression__C': [0.01, 0.1, 1, 10, 100],
    'logisticregression__l1_ratio': [0.01, 0.1, 1],
    # 'tfidfvectorizer__tokenizer': [lemmatizer]
}

In [28]:
# pass pipe and params to gridsearch
lg_gs = GridSearchCV(lg_pipe, lg_params, n_jobs=-1)
lg_gs.fit(X_train, y_train)
print('TRAIN', lg_gs.score(X_train, y_train))
print('TEST', lg_gs.score(X_test, y_test))
print('BEST PARAMS', lg_gs.best_params_)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

TRAIN 0.8840298920564628
TEST 0.7625570776255708
BEST PARAMS {'logisticregression__C': 1, 'logisticregression__l1_ratio': 0.01, 'logisticregression__max_iter': 100, 'tfidfvectorizer__max_features': 7000, 'tfidfvectorizer__ngram_range': (1, 3), 'tfidfvectorizer__stop_words': 'english'}


In [29]:
preds_lg = lg_gs.predict(X_test)
preds_lg

array([1, 0, 1, ..., 0, 0, 1])

#### Classification metrics

In [30]:
# metrics
print('RECALL:', recall_score(y_test, preds_lg))
print('PRECISION:', precision_score(y_test, preds_lg))
print('F1:', f1_score(y_test, preds_lg))

RECALL: 0.7396006655574043
PRECISION: 0.774390243902439
F1: 0.7565957446808511


#### Coefficients

In [31]:
#look at coefs
coefs_lg = lg_gs.best_estimator_['logisticregression'].coef_
coefs_lg

array([[ 0.27150233, -0.06052343,  0.08805566, ...,  0.76319591,
         0.10023167,  0.18127272]])

----

In [33]:
#put coefs in dataframe with words
coeflg_df = pd.DataFrame({
    'coefs': coefs_lg[0], 
    'features': lg_gs.best_estimator_['tfidfvectorizer'].get_feature_names_out()
})
coeflg_df.head()

Unnamed: 0,coefs,features
0,0.271502,000
1,-0.060523,000 men
2,0.088056,000 years
3,-0.003156,10
4,-0.197704,10 books


In [34]:
#examine ten largest
coeflg_df.nlargest(10, 'coefs')

Unnamed: 0,coefs,features
6421,4.106674,trek
216,3.634692,alien
224,3.555121,aliens
4524,3.207748,picard
2001,3.036064,episode
5394,3.02443,scifi
5775,2.949026,space
5855,2.943446,star
1808,2.526606,ds9
2088,2.442821,expanse


In [35]:
#ten smallest
coeflg_df.nsmallest(10, 'coefs')

Unnamed: 0,coefs,features
2167,-4.626031,fantasy
684,-2.869749,book
2716,-2.59709,grimdark
1777,-2.150264,dragons
3775,-2.112112,magic
3896,-2.10658,medieval
6091,-1.833956,sword
1017,-1.73826,characters
440,-1.72283,author
451,-1.72088,authors
