In [None]:
!pip install -U spacy
!python -m spacy download en_core_web_trf


In [None]:
!pip install scikit-learn

In [None]:
import pandas as pd
import os
import numpy as np
import spacy
def read_txt_files(directory):
    """Return the text of every regular file directly inside ``directory``.

    Parameters
    ----------
    directory : str
        Path of the folder to scan (not recursive).

    Returns
    -------
    list of str
        One entry per file, ordered by file name.
    """
    data = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the row
    # order (and therefore every downstream CV split) reproducible.
    for name in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, name)
        # Skip sub-directories such as the .ipynb_checkpoints folder Jupyter
        # creates; open() would raise on them.
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r', encoding='utf-8') as handle:
            data.append(handle.read())
    return data
        
# Raw review texts, one list per split and class.
train_deceptive = read_txt_files('data/train/deceptive')
train_truthful = read_txt_files('data/train/truthful')

test_deceptive = read_txt_files('data/test/deceptive')
test_truthful = read_txt_files('data/test/truthful')

# Labels: 0 = deceptive, 1 = truthful. A plain scalar is broadcast to the
# length of 'content', so no hard-coded review counts are needed.
df_train_deceptive = pd.DataFrame({'content': train_deceptive, 'labels': 0})
df_train_truthful = pd.DataFrame({'content': train_truthful, 'labels': 1})
# ignore_index avoids duplicate row labels in the combined frame.
df_train = pd.concat([df_train_deceptive, df_train_truthful], ignore_index=True)

# NOTE(review): newlines are removed without inserting a space, so words on
# either side of a line break are joined — confirm this is intended.
df_train = df_train.replace('\n', '', regex=True)

df_test_deceptive = pd.DataFrame({'content': test_deceptive, 'labels': 0})
df_test_truthful = pd.DataFrame({'content': test_truthful, 'labels': 1})
df_test = pd.concat([df_test_deceptive, df_test_truthful], ignore_index=True)

df_test = df_test.replace('\n', '', regex=True)

In [None]:
# TODO: plot figure(s)
# Tokenize, lemmatize and lowercase; drop stop words and punctuation.
# Numeric tokens are kept: the filter below has no number check.
def tokenize_lemmatize_data(reviews, nlp=None):
    """Convert each review into a list of lowercased lemmas.

    Parameters
    ----------
    reviews : iterable of str
        Raw review texts.
    nlp : callable, optional
        Object that maps a string to an iterable of tokens exposing
        ``lemma_``, ``is_stop`` and ``is_punct``. When omitted, the
        ``en_core_web_trf`` spaCy pipeline is loaded once and reused on
        later calls instead of being reloaded each time.

    Returns
    -------
    list of list of str
        One token list per review, in input order.
    """
    if nlp is None:
        # Cache the loaded pipeline on the function object so the train and
        # test calls share a single model load.
        nlp = getattr(tokenize_lemmatize_data, '_nlp', None)
        if nlp is None:
            nlp = spacy.load("en_core_web_trf")
            tokenize_lemmatize_data._nlp = nlp
    return [
        [token.lemma_.lower()
         for token in nlp(review)
         if not token.is_stop and not token.is_punct]
        for review in reviews
    ]
        
        
# Preprocess the review text of the training split first, then the test split.
data_train, data_test = (
    tokenize_lemmatize_data(frame['content']) for frame in (df_train, df_test)
)

In [None]:
# Re-join every token list into one space-separated string per review.
preprocessed_train = list(map(' '.join, data_train))
preprocessed_test = list(map(' '.join, data_test))


In [None]:
def hyperparamater_tune(clf, param_grid, X_train, y_train,
                        n_splits=5, scoring='accuracy', random_state=0):
    """Grid-search ``param_grid`` for ``clf`` with shuffled K-fold cross-validation.

    ``KFold`` and ``GridSearchCV`` are looked up when the function is called,
    so the cells importing them must have run first.

    Parameters
    ----------
    clf : estimator
        Estimator or pipeline handed to ``GridSearchCV``.
    param_grid : dict
        Parameter grid handed to ``GridSearchCV``.
    X_train, y_train
        Training inputs and labels passed to ``fit``.
    n_splits : int, default 5
        Number of cross-validation folds.
    scoring : str, default 'accuracy'
        Metric used to rank parameter combinations.
    random_state : int, default 0
        Seed for the fold shuffling.

    Returns
    -------
    GridSearchCV
        The fitted search. Previously nothing was returned, so the best score
        and CV results were lost. End the calling line with ``;`` to keep the
        notebook from displaying it.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    grid_search = GridSearchCV(
        clf,
        param_grid,
        cv=kf,
        scoring=scoring
    )

    grid_search.fit(X_train, y_train)
    print("Best parameters:", grid_search.best_params_)
    # The score makes separate searches (e.g. unigram vs. bigram) comparable.
    print("Best CV score:", grid_search.best_score_)
    return grid_search


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline 
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import KFold
# Token counts feeding a multinomial Naive Bayes classifier.
pipeline = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('classifier', MultinomialNB()),
])




In [None]:
from sklearn.model_selection import GridSearchCV
# Search space for the unigram Naive Bayes pipeline: vocabulary size cap plus
# the classifier's `alpha` and `fit_prior` settings.
# hyperparamater_tune builds its own KFold and GridSearchCV, so the unused
# module-level `kf` and the search parked inside a string literal are removed.
param_grid_nb = {
    'vectorizer__max_features': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 2000, 4000, 6000, None],
    'vectorizer__ngram_range': [(1, 1)],
    'classifier__alpha': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'classifier__fit_prior': [True, False]
}
hyperparamater_tune(pipeline, param_grid_nb, preprocessed_train, df_train['labels'])

In [None]:
from sklearn.model_selection import GridSearchCV
# Same search as the unigram cell, but the vectorizer also emits bigrams.
# hyperparamater_tune builds its own KFold and GridSearchCV, so the unused
# module-level `kf` and the search parked inside a string literal are removed.
param_grid_nb = {
    'vectorizer__max_features': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 2000, 4000, 6000, None],
    'vectorizer__ngram_range': [(1, 2)],
    'classifier__alpha': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'classifier__fit_prior': [True, False]
}
hyperparamater_tune(pipeline, param_grid_nb, preprocessed_train, df_train['labels'])

In [None]:
#MultinomialNB optimized unigram
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score


# Unigram Naive Bayes with fixed settings (presumably the grid-search winner),
# fitted on the training split and scored on the test split.
vectorizer = CountVectorizer(ngram_range=(1, 1), max_features=900)
X_train = vectorizer.fit_transform(preprocessed_train)
X_test = vectorizer.transform(preprocessed_test)
clf = MultinomialNB(alpha=1, fit_prior=True)
clf.fit(X_train, df_train['labels'])
y_pred_nb_u = clf.predict(X_test)
# Printed one per line in this order: accuracy, recall, precision, F1.
print(
    *(metric(df_test['labels'], y_pred_nb_u)
      for metric in (accuracy_score, recall_score, precision_score, f1_score)),
    sep='\n',
)

In [None]:
def get_feature_importance_nb(vectorizer, nb_model):
    """Rank vocabulary features by how differently the two classes weight them.

    The feature names are read from ``vectorizer`` itself rather than from a
    notebook-level global, so they always match the vectorizer passed in,
    even after the global ``vectorizer`` has been refitted.

    Parameters
    ----------
    vectorizer
        Fitted vectorizer exposing ``get_feature_names_out()``.
    nb_model
        Fitted model exposing ``feature_log_prob_`` with one row per class
        (rows 0 and 1 are used).

    Returns
    -------
    pandas.DataFrame
        Columns ``Feature``, ``Class_0_Prob``, ``Class_1_Prob`` and
        ``Importance`` (absolute probability difference), sorted by
        ``Importance`` in descending order.
    """
    feature_names = vectorizer.get_feature_names_out()

    # Convert the stored log-probabilities back to probabilities.
    feature_prob = np.exp(nb_model.feature_log_prob_)

    # Absolute gap between the class-conditional probabilities: the larger
    # the gap, the more the feature separates the two classes.
    prob_diff = np.abs(feature_prob[1] - feature_prob[0])

    importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Class_0_Prob': feature_prob[0],
        'Class_1_Prob': feature_prob[1],
        'Importance': prob_diff
    })

    return importance_df.sort_values('Importance', ascending=False)

# Importance table for the vectorizer/classifier pair currently bound to
# `vectorizer` and `clf`.
importance_df = get_feature_importance_nb(vectorizer=vectorizer, nb_model=clf)


In [None]:
importance_df

In [None]:
TOP_N = 5  # how many of the most probable features to list per class
feature_names = np.array(vectorizer.get_feature_names_out())

# For each class
for i, class_label in enumerate(clf.classes_):
    # exp() turns the stored log-probabilities into probabilities, so the
    # values shown below are probabilities, not logs.
    class_probs = np.exp(clf.feature_log_prob_[i])

    # Indices of the TOP_N largest probabilities, highest first.
    top_indices = np.argsort(class_probs)[-TOP_N:][::-1]

    top_features_df = pd.DataFrame({
        'Feature': feature_names[top_indices],
        'Probability': class_probs[top_indices]
    })

    print(f"\nTop {TOP_N} features for class {class_label}:")
    print(top_features_df.to_string(index=False))


In [None]:
importance_df.head(5)

In [None]:
#MultinomialNB optimized unigram+bigram
vectorizer = CountVectorizer(max_features = 900, ngram_range = (1,2))
X_train = vectorizer.fit_transform(preprocessed_train)
X_test = vectorizer.transform(preprocessed_test)
clf = MultinomialNB(alpha=0.6, fit_prior=True)
clf.fit(X_train, df_train['labels'])
y_pred_nb_b = clf.predict(X_test)
# Report this model's own predictions. The previously commented-out prints
# referenced `y_pred`, which is not this model's output.
# Order: accuracy, recall, precision, F1.
print(accuracy_score(df_test['labels'], y_pred_nb_b))
print(recall_score(df_test['labels'], y_pred_nb_b))
print(precision_score(df_test['labels'], y_pred_nb_b))
print(f1_score(df_test['labels'], y_pred_nb_b))


In [None]:
!pip install plotly==5.24.1

In [None]:
import plotly.express as px
import pandas as pd

# Hard-coded test-set metrics for the two Naive Bayes models (not computed in
# this cell). The evaluation cells print accuracy, recall, precision, F1 in
# that order, and the second value of each series is a multiple of 1/80
# (e.g. 0.925 = 74/80), i.e. recall on 80 positive test reviews. The labels
# therefore follow that order; previously Precision and Recall were swapped.
data = {
    'Metrics': ['Accuracy', 'Recall', 'Precision', 'F1-Score'] * 2,
    'Model': ['Unigram'] * 4 + ['Unigram + Bigram'] * 4,
    'Values': [0.89375, 0.925, 0.87059, 0.89697,  # unigram
               0.8625, 0.95, 0.80851, 0.87356]   # unigram + bigram
}

# Create dataframe
df = pd.DataFrame(data)

# Grouped bars: one group per metric, one bar per model, value shown on the bar.
fig = px.bar(df,
             x='Metrics',
             y='Values',
             color='Model',
             text='Values',
             title='Multinomial Naive Bayes classifiers',
             barmode='group')

fig.update_layout(
    font=dict(
        family="Courier New, monospace",
        size=32,
        color="RebeccaPurple"
    )
)

# Save as HTML file
fig.write_html("naive_bayes.html")

# Show plot
fig.show()

# Save as static image (requires the kaleido package to be installed)
fig.write_image("naive_bayes.png")

In [None]:
!pip install -U kaleido


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
# Search over `C` for the unigram logistic-regression pipeline.
pipeline = Pipeline(steps=[
    ('vectorizer', CountVectorizer(ngram_range=(1, 1))),
    ('classifier', LogisticRegression()),
])
param_grid_logreg = {
    'classifier__C': [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1]
}
hyperparamater_tune(pipeline, param_grid_logreg, preprocessed_train, df_train['labels'])

# Refit with C=0.1 (presumably the value reported by the search) and score
# the model on the test split.
vectorizer = CountVectorizer(ngram_range=(1, 1))
X_train = vectorizer.fit_transform(preprocessed_train)
X_test = vectorizer.transform(preprocessed_test)
clf = LogisticRegression(C=0.1)
clf.fit(X_train, df_train['labels'])
y_pred_logres = clf.predict(X_test)
# Printed one per line in this order: accuracy, recall, precision, F1.
print(
    *(metric(df_test['labels'], y_pred_logres)
      for metric in (accuracy_score, recall_score, precision_score, f1_score)),
    sep='\n',
)


In [None]:
# Same `C` search as the unigram cell, with unigram + bigram counts.
pipeline = Pipeline(steps=[
    ('vectorizer', CountVectorizer(ngram_range=(1, 2))),
    ('classifier', LogisticRegression()),
])
hyperparamater_tune(pipeline, param_grid_logreg, preprocessed_train, df_train['labels'])

In [None]:
# Unigram + bigram logistic regression with C=0.7, scored on the test split.
# The predictions go into the generic name `y_pred`, which later cells reassign.
vectorizer = CountVectorizer(ngram_range=(1, 2))
X_train = vectorizer.fit_transform(preprocessed_train)
X_test = vectorizer.transform(preprocessed_test)
clf = LogisticRegression(C=0.7)
clf.fit(X_train, df_train['labels'])
y_pred = clf.predict(X_test)
# Printed one per line in this order: accuracy, recall, precision, F1.
print(
    *(metric(df_test['labels'], y_pred)
      for metric in (accuracy_score, recall_score, precision_score, f1_score)),
    sep='\n',
)

In [None]:
# Hard-coded test-set metrics for the two logistic-regression models (not
# computed in this cell). The evaluation cells print accuracy, recall,
# precision, F1 in that order, and the second value of each series is a
# multiple of 1/80 (0.9375 = 75/80), i.e. recall on 80 positive test reviews.
# The labels follow that order; previously Precision and Recall were swapped.
data = {
    'Metrics': ['Accuracy', 'Recall', 'Precision', 'F1-Score'] * 2,
    'Model': ['Unigram'] * 4 + ['Unigram + Bigram'] * 4,
    'Values': [0.88125, 0.9375, 0.84270, 0.88757,  # unigram
               0.8625, 0.9125, 0.82954, 0.87905]   # unigram + bigram
}

# Create dataframe
df = pd.DataFrame(data)

# Grouped bars: one group per metric, one bar per model, value shown on the bar.
fig = px.bar(df,
             x='Metrics',
             y='Values',
             color='Model',
             text='Values',
             title='Logistic Regression classifiers',
             barmode='group')

fig.update_layout(
    font=dict(
        family="Courier New, monospace",
        size=32,
        color="RebeccaPurple"
    )
)

# Save as HTML file
fig.write_html("Logistic.html")

# Show plot
fig.show()

# Save as static image (requires the kaleido package to be installed)
fig.write_image("Logistic.png")

In [None]:
from sklearn.tree import DecisionTreeClassifier
# X_train_nofs / X_test_nofs are not defined in any visible cell, so this cell
# raised NameError on a fresh kernel. They are built here from a default
# CountVectorizer.
# NOTE(review): "nofs" presumably means "no feature selection" — confirm that
# a default, uncapped vocabulary is what the original matrices contained.
vectorizer_nofs = CountVectorizer()
X_train_nofs = vectorizer_nofs.fit_transform(preprocessed_train)
X_test_nofs = vectorizer_nofs.transform(preprocessed_test)

# Untuned decision tree as a baseline; the last line displays test accuracy.
clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_train_nofs,df_train['labels'])
clf.score(X_test_nofs,df_test['labels'])

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Search over leaf/split size limits for the unigram decision-tree pipeline.
pipeline = Pipeline([('vectorizer', CountVectorizer(ngram_range = (1,1))), 
    ('classifier', DecisionTreeClassifier(random_state = 0)) ])
param_grid_dt = {
    'classifier__min_samples_leaf': np.arange(1,105,2),
    'classifier__min_samples_split': np.arange(2,105,2)
}

# Fixed typo: the grid was passed as the undefined name `param_g33rid_dt`.
hyperparamater_tune(pipeline, param_grid_dt, preprocessed_train, df_train['labels'])



In [None]:
# Unigram decision tree with fixed leaf/split limits (presumably taken from
# the grid search), scored on the test split.
vectorizer = CountVectorizer(ngram_range=(1, 1))
X_train = vectorizer.fit_transform(preprocessed_train)
X_test = vectorizer.transform(preprocessed_test)
clf = DecisionTreeClassifier(min_samples_leaf=66, min_samples_split=2, random_state=0)
clf.fit(X_train, df_train['labels'])
y_pred_dt = clf.predict(X_test)
# Printed one per line in this order: accuracy, recall, precision, F1.
print(
    *(metric(df_test['labels'], y_pred_dt)
      for metric in (accuracy_score, recall_score, precision_score, f1_score)),
    sep='\n',
)

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Decision-tree search repeated with unigram + bigram counts.
pipeline = Pipeline(steps=[
    ('vectorizer', CountVectorizer(ngram_range=(1, 2))),
    ('classifier', DecisionTreeClassifier(random_state=0)),
])
param_grid_dt = {
    'classifier__min_samples_leaf': np.arange(1, 105, 2),
    'classifier__min_samples_split': np.arange(2, 105, 2),
}

hyperparamater_tune(pipeline, param_grid_dt, preprocessed_train, df_train['labels'])



In [None]:
# Unigram + bigram decision tree, scored on the test split. The predictions
# overwrite the generic name `y_pred`.
vectorizer = CountVectorizer(ngram_range=(1, 2))
X_train = vectorizer.fit_transform(preprocessed_train)
X_test = vectorizer.transform(preprocessed_test)
clf = DecisionTreeClassifier(min_samples_leaf=1, min_samples_split=4, random_state=0)
clf.fit(X_train, df_train['labels'])
y_pred = clf.predict(X_test)
# Printed one per line in this order: accuracy, recall, precision, F1.
print(
    *(metric(df_test['labels'], y_pred)
      for metric in (accuracy_score, recall_score, precision_score, f1_score)),
    sep='\n',
)

In [None]:
# Hard-coded test-set metrics for the two decision-tree models (not computed
# in this cell). The evaluation cells print accuracy, recall, precision, F1 in
# that order, and the second value of each series is a multiple of 1/80
# (0.7125 = 57/80), i.e. recall on 80 positive test reviews. The labels follow
# that order; previously Precision and Recall were swapped.
data = {
    'Metrics': ['Accuracy', 'Recall', 'Precision', 'F1-Score'] * 2,
    'Model': ['Unigram'] * 4 + ['Unigram + Bigram'] * 4,
    'Values': [0.61875, 0.7125, 0.6, 0.6514,  # unigram
               0.6625, 0.7375, 0.6413, 0.6860]   # unigram + bigram
}

# Create dataframe
df = pd.DataFrame(data)

# Grouped bars: one group per metric, one bar per model, value shown on the bar.
fig = px.bar(df,
             x='Metrics',
             y='Values',
             color='Model',
             text='Values',
             title='Decision Tree classifiers',
             barmode='group')

fig.update_layout(
    font=dict(
        family="Courier New, monospace",
        size=32,
        color="RebeccaPurple"
    )
)

# Save as HTML file
fig.write_html("DT.html")

# Show plot
fig.show()

# Save as static image (requires the kaleido package to be installed)
fig.write_image("DT.png")

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Search over leaf size and forest size for the unigram random-forest pipeline.
pipeline = Pipeline(steps=[
    ('vectorizer', CountVectorizer(ngram_range=(1, 1))),
    ('classifier', RandomForestClassifier(random_state=0)),
])
param_grid_rf = {
    'classifier__min_samples_leaf': np.arange(1, 101, 10),
    'classifier__n_estimators': [50, 100, 150, 200],
}

hyperparamater_tune(pipeline, param_grid_rf, preprocessed_train, df_train['labels'])



In [None]:
# Unigram random forest with fixed settings, scored on the test split.
vectorizer = CountVectorizer(ngram_range = (1,1))
X_train = vectorizer.fit_transform(preprocessed_train)
X_test = vectorizer.transform(preprocessed_test)
clf = RandomForestClassifier(random_state = 0, min_samples_leaf = 1, n_estimators = 100)
clf.fit(X_train, df_train['labels'])
y_pred_rf_u = clf.predict(X_test)
# Report this model's own predictions. The previously commented-out prints
# referenced `y_pred`, which holds another model's output.
# Order: accuracy, recall, precision, F1.
print(accuracy_score(df_test['labels'], y_pred_rf_u))
print(recall_score(df_test['labels'], y_pred_rf_u))
print(precision_score(df_test['labels'], y_pred_rf_u))
print(f1_score(df_test['labels'], y_pred_rf_u))

In [None]:
# Random-forest search repeated with unigram + bigram counts.
pipeline = Pipeline(steps=[
    ('vectorizer', CountVectorizer(ngram_range=(1, 2))),
    ('classifier', RandomForestClassifier(random_state=0)),
])
param_grid_rf = {
    'classifier__min_samples_leaf': np.arange(1, 101, 10),
    'classifier__n_estimators': [50, 100, 150, 200],
}

hyperparamater_tune(pipeline, param_grid_rf, preprocessed_train, df_train['labels'])



In [None]:
# Unigram + bigram random forest with fixed settings, scored on the test split.
vectorizer = CountVectorizer(ngram_range = (1,2))
X_train = vectorizer.fit_transform(preprocessed_train)
X_test = vectorizer.transform(preprocessed_test)
clf = RandomForestClassifier(random_state = 0, min_samples_leaf = 1, n_estimators = 150)
clf.fit(X_train, df_train['labels'])
y_pred_rf_b = clf.predict(X_test)
# Score this model's predictions. The prints previously used `y_pred`, which
# at this point still holds the predictions of an earlier model.
# Order: accuracy, recall, precision, F1.
print(accuracy_score(df_test['labels'], y_pred_rf_b))
print(recall_score(df_test['labels'], y_pred_rf_b))
print(precision_score(df_test['labels'], y_pred_rf_b))
print(f1_score(df_test['labels'], y_pred_rf_b))

In [None]:
# Hard-coded test-set metrics for the two random-forest models (not computed
# in this cell). The evaluation cells print accuracy, recall, precision, F1 in
# that order, and the second value of each series is a multiple of 1/80
# (0.7625 = 61/80), i.e. recall on 80 positive test reviews. The labels follow
# that order; previously Precision and Recall were swapped.
data = {
    'Metrics': ['Accuracy', 'Recall', 'Precision', 'F1-Score'] * 2,
    'Model': ['Unigram'] * 4 + ['Unigram + Bigram'] * 4,
    'Values': [0.79375, 0.7625, 0.8133, 0.7871,  # unigram
               0.79375, 0.9, 0.7423, 0.8136]   # unigram + bigram
}

# Create dataframe
df = pd.DataFrame(data)

# Grouped bars: one group per metric, one bar per model, value shown on the bar.
fig = px.bar(df,
             x='Metrics',
             y='Values',
             text='Values',
             color='Model',
             title='Random Forest classifiers',
             barmode='group')

fig.update_layout(
    font=dict(
        family="Courier New, monospace",
        size=32,
        color="RebeccaPurple"
    )
)

# Save as HTML file
fig.write_html("rf.html")

# Show plot
fig.show()

# Save as static image (requires the kaleido package to be installed)
fig.write_image("rf.png")

In [None]:
from sklearn.naive_bayes import MultinomialNB
def train_and_eval(clf, X,y, X_t, y_t):
    """Fit ``clf`` on default bag-of-words counts and report test accuracy.

    Parameters
    ----------
    clf
        Classifier exposing ``fit`` and ``predict``; it is fitted in place.
    X, y
        Training texts and labels.
    X_t, y_t
        Test texts and labels.

    Returns
    -------
    The accuracy computed by ``accuracy_score`` on the test data. It is also
    printed, as before; previously the function returned nothing.
    """
    vectorizer = CountVectorizer()
    # The vocabulary is learned from the training texts only.
    X_train = vectorizer.fit_transform(X)
    X_test = vectorizer.transform(X_t)
    clf.fit(X_train, y)
    accuracy = accuracy_score(y_t, clf.predict(X_test))
    print(accuracy)
    return accuracy

    

In [None]:
pip install mlxtend

In [None]:
from mlxtend.evaluate import mcnemar_table
from mlxtend.evaluate import mcnemar
# McNemar test: unigram vs. unigram + bigram Naive Bayes on the test labels.
chi2, p_value = mcnemar(
    mcnemar_table(
        y_target=df_test['labels'],
        y_model1=np.array(y_pred_nb_u),
        y_model2=np.array(y_pred_nb_b),
    ),
    corrected=False,
)
print('Q: %.3f' % chi2)
print('p-value: %.3f' % p_value)

In [None]:
# McNemar test: unigram vs. unigram + bigram logistic regression.
# The bigram predictions are recomputed here rather than read from the shared
# name `y_pred`: by the time this cell runs, `y_pred` has been overwritten with
# decision-tree predictions, so the old comparison mixed two model families.
# Same configuration as the bigram evaluation cell (ngram_range=(1, 2), C=0.7).
logres_bigram = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range=(1, 2))),
    ('classifier', LogisticRegression(C=0.7)),
])
logres_bigram.fit(preprocessed_train, df_train['labels'])
y_pred_logres_b = logres_bigram.predict(preprocessed_test)

chi2, p_value = mcnemar(mcnemar_table(df_test['labels'],
                                      np.array(y_pred_logres),
                                      np.array(y_pred_logres_b)),
                        corrected=False)
print('Q: %.3f' % chi2)
print('p-value: %.3f' % p_value)

In [None]:
# McNemar test: unigram decision tree vs. the predictions held in `y_pred`.
# NOTE(review): in top-to-bottom order `y_pred` was last assigned by the
# unigram + bigram decision-tree cell; confirm no other cell ran in between.
chi2, p_value = mcnemar(mcnemar_table(df_test['labels'], 
                                      np.array(y_pred_dt), 
                                      np.array(y_pred)),
                        corrected=False)
print('Q: %.3f' % chi2)
# A stray `y_pred_nb_u` fused onto the end of this line was a syntax error.
print('p-value: %.3f' % p_value)

In [None]:
# McNemar test: unigram logistic regression vs. unigram Naive Bayes.
chi2, p_value = mcnemar(
    mcnemar_table(
        y_target=df_test['labels'],
        y_model1=np.array(y_pred_logres),
        y_model2=np.array(y_pred_nb_u),
    ),
    corrected=False,
)
print('Q: %.3f' % chi2)
print('p-value: %.3f' % p_value)

In [None]:
# McNemar test: unigram logistic regression vs. unigram random forest.
chi2, p_value = mcnemar(
    mcnemar_table(
        y_target=df_test['labels'],
        y_model1=np.array(y_pred_logres),
        y_model2=np.array(y_pred_rf_u),
    ),
    corrected=False,
)
print('Q: %.3f' % chi2)
print('p-value: %.3f' % p_value)