### Modelling

In [18]:
 #!conda install -c conda-forge imbalanced-learn --yes

In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,  roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from nltk.corpus import stopwords
from imblearn.over_sampling import SMOTE

import warnings
warnings.filterwarnings('ignore')




In [20]:
combined = pd.read_csv('datasets/combined.csv')
combined.shape

(59129, 12)

In [21]:
# select X and y columns we need
df =  combined[['Subreddit', 'preprocessed_words']]

# add label for classification
df['is_amd'] = df['Subreddit'].apply(lambda x: 1 if x == "AMD" else 0)
df = df.drop(columns = 'Subreddit')
df = df.rename(columns={'preprocessed_words':'text'})

In [22]:
df.head()

Unnamed: 0,text,is_amd
0,keeps its price from the vram it has a is a ta...,0
1,it s not you just a combo of the game running ...,0
2,starfield is cpu heavy which one do you have a...,0
3,the game is just awfully optimized i just upgr...,0
4,i tried using fsr but i did nt see any noticib...,0


In [23]:
# Specify Stopwords
custom_stopwords = [ "subreddit", "reddit"]  # remove these words as it is not meaningful for our analysis
stopwords_list = list(set(stopwords.words('english') + custom_stopwords))

For this notebook on Modelling, I will only be considering the "text" column of the scraped dataset, and this has been pre-processed in notebook 2. This ensures that our model can be properly trained on the content of the subreddit posts.


##### Baseline 
We always begin with creating a baseline model.

In [24]:
# Baseline model
X = df['text']
y = df['is_amd']
y.value_counts(normalize=True)

is_amd
1    0.634545
0    0.365455
Name: proportion, dtype: float64

#### Model Preparation


Steps I took for this section:
1. Train Test Split
2. Instantiating Vectorizers and Models
3. Creating a User Define Function* with Scikit-learn's Pipeline tool that will help calculate the relevant classification metrics from each model (Metrics include Accuracy, Specificity and F1_Score)
4. Evaluate best model
5. Tune Hyper-parameters of best model



In [25]:
# Train Test Split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42, stratify = y)
X_train = X_train.values.astype('U')
X_test = X_test.values.astype('U')
print(X_train.shape)
print(X_test.shape)

(47303,)
(11826,)


In [26]:
cvec = CountVectorizer(stop_words=stopwords_list)
cvec.fit(X_train)
X_train = cvec.transform(X_train) #transform the corpus

In [27]:
print(cvec.get_feature_names_out())
print(X_train.shape)

['aa' 'aaa' 'aaaaa' ... 'zx' 'zz' 'zzx']
(47303, 32124)


In [28]:
# Transform test
X_test = cvec.transform(X_test)

##### 1. Train Test Split

In [29]:
# Redefine train and test data

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42, stratify = y)
X_train = X_train.values.astype('U')
X_test = X_test.values.astype('U')

From the Baseline model, we can see moderate imbalance in our dataset (65%-35%). Hence, we can use the SMOTE technique to correct this.

In [30]:
X_train
# Initialize the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words=stopwords_list)

# Transform the text data into a TF-IDF matrix
tfidf_matrix = tfidf_vectorizer.fit_transform(X_train)
tfidf_matrix

<47303x32124 sparse matrix of type '<class 'numpy.float64'>'
	with 842863 stored elements in Compressed Sparse Row format>

In [31]:
X_train
# Initialize the cvec vectorizer
cvec_vectorizer = CountVectorizer(stop_words=stopwords_list)

# Transform the text data into a cvec matrix
cvec_matrix = cvec_vectorizer.fit_transform(X_train)
cvec_matrix

<47303x32124 sparse matrix of type '<class 'numpy.int64'>'
	with 842863 stored elements in Compressed Sparse Row format>

In [32]:
##Now we can create synthetic data for our training set

sm = SMOTE(sampling_strategy='auto', random_state=42)
Xsm_train_t, ysm_train_t = sm.fit_resample(tfidf_matrix, y_train)
# Xsm_train, ysm_train = sm.fit_resample(X_train_try, y_train)

In [33]:
##Now we can create synthetic data for our training set

sm = SMOTE(sampling_strategy='auto', random_state=42)
Xsm_train_c, ysm_train_c = sm.fit_resample(cvec_matrix, y_train)
# Xsm_train, ysm_train = sm.fit_resample(X_train_try, y_train)

In [34]:
 # ysm_train.value_counts(normalize=True)

In [35]:
# Xsm_train.shape[1]

In [36]:
X_test_tfidf = tfidf_vectorizer.transform(X_test)
X_test_tfidf

<11826x32124 sparse matrix of type '<class 'numpy.float64'>'
	with 203885 stored elements in Compressed Sparse Row format>

In [37]:
X_test_cvec = cvec_vectorizer.transform(X_test)
X_test_cvec

<11826x32124 sparse matrix of type '<class 'numpy.int64'>'
	with 203885 stored elements in Compressed Sparse Row format>

In [38]:
# # Create a Random Forest classifier
# clf = RandomForestClassifier(random_state=42)

# # Fit the classifier to your sparse training data
# clf.fit(Xsm_train, ysm_train)

In [39]:
# y_pred = clf.predict(X_test_tfidf)
# y_pred

In [40]:
# from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score

# print("rf")
# # Calculate and print accuracy for the test set
# accuracy_test = accuracy_score(y_test, y_pred)
# print("Test Accuracy:", accuracy_test)

# # Calculate and print specificity (true negative rate) for the test set
# confusion_matrix_test = confusion_matrix(y_test, y_pred)
# tn, fp, fn, tp = confusion_matrix_test.ravel()
# specificity = tn / (tn + fp)
# print("Specificity:", specificity)

# # Calculate and print F1 score for the test set
# f1_score_test = classification_report(y_test, y_pred, target_names=['class 0', 'class 1'], output_dict=True)['class 1']['f1-score']
# print("F1 Score:", f1_score_test)

# # Calculate and print ROC AUC score for the test set
# roc_auc = roc_auc_score(y_test, clf.predict_proba(X_test_tfidf)[:, 1])
# print("ROC AUC Score:", roc_auc)

##### 2. Instantiation of Vectorizers and Models

I will be exploring different Classification algorithms and using both Count Vectorizer or Term Frequency-Inverse Document Frequency (TFIDF) transformers:
- Count Vectorizer: Takes every word as a token, and uses it as a feature.
- TFIFD: accounts for frequency of a word in a given document and the frequency between documents. Word importance increases proportionally to the number of times it appears in a document, but is offset by frequency of word in entire corpus.


In [41]:
# Instantiate Vectorizers
vectorizers = {'cvec': CountVectorizer(stop_words=stopwords_list),
               'tvec': TfidfVectorizer(stop_words=stopwords_list)
}

In [42]:
# Instiantiate models
models = {'nb': MultinomialNB(),
          'log_reg': LogisticRegression(max_iter=500, random_state=123),
          'rf': RandomForestClassifier(random_state=123),
          'knn': KNeighborsClassifier()}

##### 3. User Define Function - inputs required are vectorizer and model

In [49]:
df_smote_results = []

def clf_model(vec, mod, cv_num, X_train, y_train):   # option to include Grid Search
    
    results = {}
    
    pipe = Pipeline([
            (mod, models[mod])
            ])
    
    pipe.fit(X_train, y_train)
    
    print(vec)
    if vec == 'tvec':
        # Get predictions
        preds = pipe.predict(X_test_tfidf)
        
    else:
        # Get predictions
        preds = pipe.predict(X_test_cvec)

    # Confusion Matrix
    tn, fp, fn, tp = confusion_matrix(y_test, preds).ravel()
    cm = confusion_matrix(y_test, preds)
    tn, fp, fn, tp = cm[0, 0], cm[0, 1], cm[1, 0], cm[1, 1]
    acc = (tp + tn)/ (tp+tn+fp+fn)
    spec = tn / (tn + fp)
    
    # Retrieve metrics
    results['Model'] = mod
    results['Vectorizer'] = vec
    results['Train Score'] = pipe.score(X_train, y_train)

    if vec=='tvec':
        results['Test Score'] = pipe.score(X_test_tfidf, y_test)

    else:
        results['Test Score'] = pipe.score(X_test_cvec, y_test)
        
    results['Accuracy'] = acc
    results['Specificity'] = spec
    results['f_score'] = f1_score(y_test, preds)
#     nb.predict_proba(X_test_tfidf)[:, 1]
    results['ROC_AUC'] = roc_auc_score(y_test, preds)

    df_smote_results.append(results)
    
    print(f"--- METRICS for {mod},{vec} ---")
    display(results)
    
    return pipe

In [50]:
# Multinomial Naive Bayes
cvec_nb = clf_model('cvec', 'nb', 5,Xsm_train_c,ysm_train_c)
tvec_nb = clf_model('tvec', 'nb', 5, Xsm_train_t, ysm_train_t)

cvec
--- METRICS for nb,cvec ---


{'Model': 'nb',
 'Vectorizer': 'cvec',
 'Train Score': 0.7912280117270789,
 'Test Score': 0.7168949771689498,
 'Accuracy': 0.7168949771689498,
 'Specificity': 0.7117075428042573,
 'f_score': 0.7634256642170717,
 'ROC_AUC': 0.7157951360076723}

tvec
--- METRICS for nb,tvec ---


{'Model': 'nb',
 'Vectorizer': 'tvec',
 'Train Score': 0.7987406716417911,
 'Test Score': 0.7241670894638931,
 'Accuracy': 0.7241670894638931,
 'Specificity': 0.6672836649699213,
 'f_score': 0.7769115032143346,
 'ROC_AUC': 0.7121066512482868}

In [45]:
Xsm_train.shape

NameError: name 'Xsm_train' is not defined

In [51]:
# Logistic Regression
cvec_lr = clf_model('cvec', 'log_reg', 5,Xsm_train_c,ysm_train_c)
tvec_lr = clf_model('tvec', 'log_reg', 5, Xsm_train_t, ysm_train_t)


cvec
--- METRICS for log_reg,cvec ---


{'Model': 'log_reg',
 'Vectorizer': 'cvec',
 'Train Score': 0.8447827825159915,
 'Test Score': 0.6932183324877389,
 'Accuracy': 0.6932183324877389,
 'Specificity': 0.7265155020823693,
 'f_score': 0.7360302677532015,
 'ROC_AUC': 0.7002780069047241}

tvec
--- METRICS for log_reg,tvec ---


{'Model': 'log_reg',
 'Vectorizer': 'tvec',
 'Train Score': 0.8107509328358209,
 'Test Score': 0.7098765432098766,
 'Accuracy': 0.7098765432098766,
 'Specificity': 0.7024525682554373,
 'f_score': 0.7575093646194078,
 'ROC_AUC': 0.7083025101405117}

In [52]:
# Logistic Regression using SMOTE data
cvec_lr = clf_model('cvec', 'rf', 5, Xsm_train_c, ysm_train_c)
tvec_lr = clf_model('tvec', 'rf', 5,Xsm_train_t, ysm_train_t)


cvec
--- METRICS for rf,cvec ---


{'Model': 'rf',
 'Vectorizer': 'cvec',
 'Train Score': 0.9873234275053305,
 'Test Score': 0.6727549467275494,
 'Accuracy': 0.6727549467275494,
 'Specificity': 0.5920869967607589,
 'f_score': 0.736088379705401,
 'ROC_AUC': 0.6556517073356034}

tvec
--- METRICS for rf,tvec ---


{'Model': 'rf',
 'Vectorizer': 'tvec',
 'Train Score': 0.9905050639658849,
 'Test Score': 0.7106375782174869,
 'Accuracy': 0.7106375782174869,
 'Specificity': 0.5918556223970384,
 'f_score': 0.7735874024083632,
 'ROC_AUC': 0.6854533975524638}

In [53]:
# KNN using SMOTE data
cvec_knn = clf_model('cvec', 'knn', 5, Xsm_train_c, ysm_train_c)
tvec_knn = clf_model('tvec', 'knn', 5, Xsm_train_t, ysm_train_t)
pd.DataFrame(df_smote_results)

cvec


KeyboardInterrupt: 

#### 2.2 Hyperparameter Tuning of Models

To train a robust machine learning model, we need to select the correct combination of hyperparameters.
Recall that in the user defined function created earlier, if "Gridsearch" = True, the function will perform a gridsearch to find the optimal hyperparameters that will give the best score. GridsearchCV searches all combinations of paramters in for a model that will give the best peformance score. It is not practically feasible to run a GridSearchCV on all models due to the complexity and hence time taken. Hence, I will select only the best 3 models to perform hyperparameter tuning.

From the above results table, seems like the Multinomial Naive Bayes Model, Logistic Regression and Random Forest perform the best.

- Multinomial Naive Bayes: Based on Bayes's theorem - the assumption that each feature (in our case, each word) is independent of each other.
- Logistic Regression:
- Random Forest: Consists of n number of decision trees that act as an ensemble. Each decision tree makes a class prediction and the class with the most votes becomes the model's prediction.    

In [None]:
tuning_results = []

# Vectorizer Parameters

cvec_params = {
    'cvec__max_features': [None],
    'cvec__min_df':[3, 4, 5],
    'cvec__max_df': [0.2, 0.3, 0.4],
    'cvec__stop_words': [stopwords_list],
    'cvec__ngram_range':[(1,1), (1,2)]
}


tvec_params = {
    'tvec__max_features': [None],
    'tvec__min_df':[3, 4, 5],
    'tvec__max_df': [0.2, 0.3, 0.4],
    'tvec__stop_words': [stopwords_list],
    'tvec__ngram_range':[(1,1), (1,2)]
}

rf_pipe_cvec_params = {
    'cvec__max_features': [100],
    'cvec__max_df': [0.2, 0.3],
    'cvec__min_df': [1, 2, 3],
    'cvec__ngram_range': [(1,1), (1,2)],
    'rf__n_estimators': [50,75,100]}


rf_pipe_tvec_params = {
    'tvec__max_features': [100],
    'tvec__max_df': [0.2, 0.3],
    'tvec__min_df': [1, 2, 3],
    'tvec__ngram_range': [(1,1), (1,2)],
    'rf__n_estimators': [50,75,100]}




In [None]:
# Tune for Logistic Regression
cvec_lr_gs = clf_model('cvec', 'log_reg', 3, vec_params=cvec_params, grid_search=True)


Fitting 5 folds for each of 18 candidates, totalling 90 fits
--- Best Parameters for log_reg,cvec ---


{'cvec__max_df': 0.3,
 'cvec__max_features': None,
 'cvec__min_df': 3,
 'cvec__ngram_range': (1, 2),
 'cvec__stop_words': ['of',
  'o',
  'if',
  'most',
  'as',
  'into',
  'yourselves',
  'any',
  "weren't",
  'ours',
  'itself',
  'that',
  'our',
  'doesn',
  "aren't",
  'd',
  'having',
  'out',
  "shan't",
  'both',
  "you're",
  'other',
  'here',
  "shouldn't",
  'all',
  'she',
  'aren',
  'off',
  'herself',
  'yourself',
  'should',
  'll',
  'were',
  "needn't",
  'how',
  'a',
  'about',
  'hasn',
  "haven't",
  'whom',
  "she's",
  'nor',
  'wasn',
  'ma',
  'now',
  'by',
  'them',
  "don't",
  'y',
  'what',
  "doesn't",
  'can',
  'they',
  'being',
  'not',
  'been',
  'no',
  'which',
  'than',
  "won't",
  'mightn',
  'its',
  'during',
  'between',
  'why',
  'because',
  'shouldn',
  'before',
  'or',
  'the',
  "mightn't",
  'against',
  'don',
  "couldn't",
  'hadn',
  'very',
  'does',
  'your',
  'have',
  'but',
  'down',
  'couldn',
  'subreddit',
  'on',
  

--- METRICS for log_reg,cvec ---


{'Model': 'log_reg',
 'Vectorizer': 'cvec',
 'Train Score': 0.9349724118977655,
 'Test Score': 0.7404870624048706,
 'Accuracy': 0.7404870624048706,
 'Specificity': 0.5552984729291994,
 'f_score': 0.8055502756130014,
 'ROC_AUC': 0.7012233302812308}

Fitting 5 folds for each of 18 candidates, totalling 90 fits
--- Best Parameters for log_reg,tvec ---


{'tvec__max_df': 0.2,
 'tvec__max_features': None,
 'tvec__min_df': 3,
 'tvec__ngram_range': (1, 2),
 'tvec__stop_words': ['of',
  'o',
  'if',
  'most',
  'as',
  'into',
  'yourselves',
  'any',
  "weren't",
  'ours',
  'itself',
  'that',
  'our',
  'doesn',
  "aren't",
  'd',
  'having',
  'out',
  "shan't",
  'both',
  "you're",
  'other',
  'here',
  "shouldn't",
  'all',
  'she',
  'aren',
  'off',
  'herself',
  'yourself',
  'should',
  'll',
  'were',
  "needn't",
  'how',
  'a',
  'about',
  'hasn',
  "haven't",
  'whom',
  "she's",
  'nor',
  'wasn',
  'ma',
  'now',
  'by',
  'them',
  "don't",
  'y',
  'what',
  "doesn't",
  'can',
  'they',
  'being',
  'not',
  'been',
  'no',
  'which',
  'than',
  "won't",
  'mightn',
  'its',
  'during',
  'between',
  'why',
  'because',
  'shouldn',
  'before',
  'or',
  'the',
  "mightn't",
  'against',
  'don',
  "couldn't",
  'hadn',
  'very',
  'does',
  'your',
  'have',
  'but',
  'down',
  'couldn',
  'subreddit',
  'on',
  

--- METRICS for log_reg,tvec ---


{'Model': 'log_reg',
 'Vectorizer': 'tvec',
 'Train Score': 0.8311523582013826,
 'Test Score': 0.7434466429900219,
 'Accuracy': 0.7434466429900219,
 'Specificity': 0.49074502545118,
 'f_score': 0.8147288715192965,
 'ROC_AUC': 0.6898687813823065}

Fitting 3 folds for each of 18 candidates, totalling 54 fits


KeyboardInterrupt: 

In [None]:
pd.DataFrame(tuning_results)

Unnamed: 0,Model,Vectorizer,Train Score,Test Score,Accuracy,Specificity,f_score,ROC_AUC
0,log_reg,cvec,0.934972,0.740487,0.740487,0.555298,0.80555,0.701223
1,log_reg,tvec,0.831152,0.743447,0.743447,0.490745,0.814729,0.689869


In [None]:
# Tune for Random Forest using RandomizedSearchCV to improve model runtime
cvec_rf_gs = clf_model('cvec', 'rf', 3, vec_params=rf_pipe_cvec_params, grid_search=True)


Fitting 3 folds for each of 10 candidates, totalling 30 fits


KeyboardInterrupt: 

In [None]:
tvec_rf_gs = clf_model('tvec', 'rf', 3, vec_params=rf_pipe_tvec_params, grid_search=True)


NameError: name 'results' is not defined

In [None]:
pd.DataFrame(tuning_results)

In [None]:
lr_tvec_results = {
    'Model': "Logistic Regression",
    'Vectorizer': "TVEC",
    'Train Score': tvec_lr_gs.score(X_train, y_train),
    'Test Score': tvec_lr_gs.score(X_test, y_test),
    'Accuracy': acc,
    'Specificity': spec,
    'F1 Score': f1_score
}

results_df.append(lr_tvec_results)
pd.DataFrame(results_df)


Unnamed: 0,Model,Vectorizer,Train Score,Test Score,Accuracy,Specificity,F1 Score
0,Logistic Regression,CVEC,0.680676,0.681549,0.670726,0.217955,
1,Logistic Regression,TVEC,0.672114,0.670726,0.670726,0.217955,0.787568


#### 3.2 Random Forest using *TFIFD Vectorizer*