In [198]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

# Import CountVectorizer and TFIDFVectorizer from feature_extraction.text.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [189]:
corpus = pd.read_csv('./datasets/final_clean.csv')
corpus.drop(columns=['Unnamed: 0'], inplace=True)

corpus.drop(columns=['subreddit'], inplace=True)

# 1 >>> Overwatch
# 0 >>> League of Legends 
X = corpus['text']
y = corpus['target']

In [190]:
# Split the data into the training and testing sets >>>
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    stratify=y)

In [191]:
# pipe it >>>
# 1. CountVectorizer (transformer)
# 2. LogisticRegression (estimator)

pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('lr', LogisticRegression(solver = 'lbfgs'))
])

In [192]:
# Paramies >>>
pipe_params = {
    'cvec__max_features': [2000, 3000, 4000, 5000],
    'cvec__min_df': [2, 3],
    'cvec__max_df': [.9, .95],
    'cvec__ngram_range': [(1,1), (1,2)]
}

In [193]:
# Instantiate GridSearchCV.
gs = GridSearchCV(pipe, 
                  pipe_params, 
                  cv = 5) 

In [194]:
# Fit GridSearch to training data.

gs.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        prep

In [195]:
# What's the best score?
gs.best_score_ 

0.8388059701492537

In [196]:
# Save best model as gs_model.
gs_model = gs.best_estimator_

# Score model on training set.
gs_model.score(X_train, y_train)

0.9779850746268657

In [197]:
# Score model on testing set.

gs_model.score(X_test, y_test) 

0.8742424242424243

In [175]:
# Extract best parameters:

gs.best_params_

{'cvec__max_df': 0.9,
 'cvec__max_features': 5000,
 'cvec__min_df': 2,
 'cvec__ngram_range': (1, 1)}

In [176]:
# TF-IDF >>>

# Instantiate the transformer.
tvec = TfidfVectorizer()

In [177]:
df = pd.DataFrame(tvec.fit_transform(X_train).toarray(),
                  columns=tvec.get_feature_names())
df.head()

Unnamed: 0,00,000,0001,002mrs,004adk,00min,01,02,03,04,...,وش,وفاة,يصير,게임표절해놓고,기술력이,못하내,바쁘니,베끼기,없지,재대로만들지도
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [178]:
df.shape

(2680, 10002)

In [179]:
X_train = tvec.fit_transform(X_train)

X_test = tvec.transform(X_test)

In [180]:
# Instantiate logistic regression.
lr = LogisticRegression(solver = 'lbfgs')

# Fit logistic regression.
lr.fit(X_train, y_train)

# Evaluate logistic regression.
print(f'Training Score: {lr.score(X_train, y_train)}')
print(f'Testing Score: {lr.score(X_test, y_test)}')

Training Score: 0.9649253731343284
Testing Score: 0.8560606060606061


In [181]:
# TF-IDF is better