# Setup

In [1]:
# Standard library: environment lookup and filesystem paths
import os
from pathlib import Path

# Load pandas and numpy
import pandas as pd
import numpy as np

# For text preprocessing
import spacy
nlp = spacy.load('en_core_web_sm')

# For text vectorization we will use Doc2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# For the classifier we will use Logistic Regression
from sklearn.linear_model import LogisticRegression

# For the second classifier we will use Random Forest
from sklearn.ensemble import RandomForestClassifier

# For the third classifier we will use Gradient Boosting
from sklearn.ensemble import GradientBoostingClassifier



In [2]:
# Load the dataset for training.
# The data directory can be overridden with the EVIDENCE_EXPLORER_DATA_DIR
# environment variable so the notebook is not tied to one machine; the default
# keeps the original location.
DATA_DIR = Path(os.environ.get('EVIDENCE_EXPLORER_DATA_DIR', '/Users/thebekhruz/Desktop/nlu/EvidenceExplorer/data'))
filepath_train = DATA_DIR / 'train' / 'train.csv'
# filepath_train = DATA_DIR / 'train' / 'trainX.csv'

df_train = pd.read_csv(filepath_train)

# Fail fast if the columns used later in this notebook are missing,
# instead of surfacing a KeyError several minutes into preprocessing.
missing_columns = {'Claim', 'Evidence', 'label'} - set(df_train.columns)
assert not missing_columns, f"Training data is missing columns: {sorted(missing_columns)}"

# Check the first 5 rows of the training dataset
df_train.head()


Unnamed: 0,Claim,Evidence,label
0,We should legalize the growing of coca leaf,"Robert W. Sweet, a federal judge, strongly agr...",1
1,We should ban trans fats usage in food,The net increase in LDL/HDL ratio with trans f...,1
2,We should legalize prostitution,"Pertaining to health, safety and services, the...",0
3,We should subsidize investigative journalism,"Date granted: 10 June 2002 Citation: ""For serv...",0
4,We should abolish homework,The Yarrabah community has a public library wh...,0


# Preprocessing

In [3]:
# Function to create tagged documents
# This is required for Doc2Vec to train the model

# We will use spaCy to tokenize and preprocess the text and then create tagged documents
# We will concatenate the Claim and Evidence using Delimiters to differentiate between the Claim and Evidence

def _content_lemmas(doc):
    """Return the lower-cased lemmas of a spaCy doc, skipping stop words and punctuation."""
    return [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct]


def create_tagged_document(df):
    """Turn each row of `df` into a gensim TaggedDocument for Doc2Vec.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide 'Claim' and 'Evidence' columns holding text.

    Returns
    -------
    list of TaggedDocument
        One document per row, in row order. Its words are
        ['CLAIM'] + claim lemmas + ['EVIDENCE'] + evidence lemmas, and its
        single tag is the row's index label converted to a string.
    """
    # nlp.pipe processes texts in batches, which is typically much faster than
    # calling nlp() once per claim and once per evidence inside a row loop.
    claim_docs = nlp.pipe(df['Claim'])
    evidence_docs = nlp.pipe(df['Evidence'])

    return [
        TaggedDocument(
            # The upper-case delimiters mark where the claim ends and the evidence
            # begins; lemmas are lower-cased, so they cannot collide with them.
            words=['CLAIM'] + _content_lemmas(claim_doc) + ['EVIDENCE'] + _content_lemmas(evidence_doc),
            tags=[str(row_label)],
        )
        for row_label, claim_doc, evidence_doc in zip(df.index, claim_docs, evidence_docs)
    ]


# Call the function to create tagged documents
tagged_data_train = create_tagged_document(df_train)
tagged_data_train[:2]
# Takes around 3 min

[TaggedDocument(words=['CLAIM', 'legalize', 'growing', 'coca', 'leaf', 'EVIDENCE', 'robert', 'w.', 'sweet', 'federal', 'judge', 'strongly', 'agree', 'present', 'policy', 'try', 'prohibit', 'use', 'drug', 'use', 'criminal', 'law', 'mistake', 'ref'], tags=['0']),
 TaggedDocument(words=['CLAIM', 'ban', 'trans', 'fat', 'usage', 'food', 'EVIDENCE', 'net', 'increase', 'ldl', 'hdl', 'ratio', 'trans', 'fat', 'approximately', 'double', 'saturate', 'fat', 'ref'], tags=['1'])]

# Training Doc2Vec Model

### Finding the optimal parameters.

In [4]:
# Train a Doc2Vec model on the tagged documents.
# The defaults below are the best parameters recorded from the hyperparameter search:
# vector size 600, window size 3, min_count 10, 30 training epochs, 4 workers.

# Best Params: {'vector_size': 600, 'window': 3, 'min_count': 10, 'epochs': 30, 'workers': 4}
def train_doc2vec_model(tagged_data, vector_size=600, window=3, min_count=10, epochs=30, workers=4):
    """Build a Doc2Vec vocabulary from `tagged_data` and train the model on it.

    All keyword arguments are forwarded to the Doc2Vec constructor; `epochs`
    is additionally passed to the training call. Returns the trained model.
    """
    hyperparameters = {
        'vector_size': vector_size,
        'window': window,
        'min_count': min_count,
        'workers': workers,
        'epochs': epochs,
    }
    doc2vec = Doc2Vec(**hyperparameters)

    # The vocabulary has to exist before training; corpus_count is set by build_vocab.
    doc2vec.build_vocab(tagged_data)
    doc2vec.train(tagged_data, total_examples=doc2vec.corpus_count, epochs=epochs)
    return doc2vec

# Call the function to train the model
model = train_doc2vec_model(tagged_data_train)





## Generate Embeddings

In [5]:
# Infer the vectors for the training data
def infer_vectors(model, tagged_documents):
    """Infer one vector per document and stack the results into a NumPy array.

    `model` must expose `infer_vector(words)`; each element of
    `tagged_documents` must expose a `words` attribute.
    """
    inferred = []
    for document in tagged_documents:
        inferred.append(model.infer_vector(document.words))
    return np.array(inferred)

# Infer the vectors for the training data
vectors_train = infer_vectors(model, tagged_data_train)
vectors_train.shape


(23702, 600)

# Train Logistic Regression Model

In [6]:
# Extract features and labels:
# the inferred document vectors are the features, the 'label' column is the target.
X, y = vectors_train, df_train['label']


# Fit the Logistic Regression model.
# The settings below are the best ones found while tuning:
#   C=120, penalty='l2', solver='liblinear', max_iter=10000, random_state=42

clf = LogisticRegression(
    C=120,
    penalty='l2',
    solver='liblinear',
    max_iter=10000,
    random_state=42,
)
clf.fit(X, y)





# Train Random Forest

In [7]:
# Fit the Random Forest classifier.
# The settings below are the best ones found while tuning:
#   n_estimators=20, max_depth=40, criterion='entropy', min_samples_split=2, random_state=42

clf_rf = RandomForestClassifier(
    n_estimators=20,
    max_depth=40,
    criterion='entropy',
    min_samples_split=2,
    random_state=42,
)
clf_rf.fit(X, y)

# Train Gradient Boosting Classifier

In [9]:
# Fit the Gradient Boosting classifier on the same features and labels
# used for the other two models.

clf_gb = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.4,
    max_depth=30,
    min_samples_split=6,
    subsample=0.8,
    random_state=42,
)
clf_gb.fit(X, y)

# Takes around 18 min

In [11]:
# Persist the three fitted classifiers with joblib, one pickle file each.
import joblib

for fitted_estimator, output_name in [
    (clf, 'logistic_regression_model.pkl'),
    (clf_rf, 'random_forest_model.pkl'),
    (clf_gb, 'gradient_boosting_model.pkl'),
]:
    joblib.dump(fitted_estimator, output_name)

# The Doc2Vec model is written with its own save method.
model.save('doc2vec_model')


# Code for finding the best hyperparameters. 

In [None]:
# from sklearn.model_selection import train_test_split
# from gensim.models.doc2vec import TaggedDocument, Doc2Vec
# from sklearn.metrics import accuracy_score
# # Import your classifier here
# from sklearn.linear_model import LogisticRegression
# import itertools

# def train_doc2vec_and_classify(tagged_data_train, labels, doc2vec_params, classifier_params):
#     # Train Doc2Vec model
#     model = Doc2Vec(**doc2vec_params)
#     model.build_vocab(tagged_data_train)
#     model.train(tagged_data_train, total_examples=model.corpus_count, epochs=doc2vec_params['epochs'])
    
#     # Generate vectors for classifier
#     X = [model.dv[i] for i in range(len(tagged_data_train))]
#     y = labels
    
#     # Split data
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
    
#     # Train classifier
#     classifier = LogisticRegression(**classifier_params)
#     classifier.fit(X_train, y_train)
    
#     # Evaluate
#     y_pred = classifier.predict(X_test)
#     score = accuracy_score(y_test, y_pred)
    
#     return score

# # Example hyperparameter ranges
# vector_sizes = [500, 600, 700]
# windows = [3]
# min_counts = [8, 10, 12]
# epochs = [30]
# # Classifier parameters example
# classifier_params = {'max_iter': 100}

# # Prepare your tagged data and labels
# # tagged_data_train = [...]
# # labels = [...]

# best_score = 0
# best_params = {}

# # Iterate over all combinations
# for combo in itertools.product(vector_sizes, windows, min_counts, epochs):
#     vs, win, mc, ep = combo
#     params = {'vector_size': vs, 'window': win, 'min_count': mc, 'epochs': ep, 'workers': 4}
#     score = train_doc2vec_and_classify(tagged_data_train, y, params, classifier_params)
    
#     if score > best_score:
#         best_score = score
#         best_params = params

# print(f"Best Score: {best_score}")
# print(f"Best Params: {best_params}")


In [None]:
# # Load the dataset for testing
# filepath_test = '/Users/thebekhruz/Desktop/nlu/EvidenceExplorer/data/validate/dev.csv'
# df_test = pd.read_csv(filepath_test)

# tagged_data_test = create_tagged_document(df_test)
# vectors_test = infer_vectors(model, tagged_data_test)


In [None]:
# from sklearn.metrics import roc_curve
# from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix

# clf = [
#     # LogisticRegression(random_state=42, max_iter=10000, C=120, penalty='None', solver='liblinear'),
#     LogisticRegression(random_state=42, max_iter=10000, C=120, penalty='l2', solver='liblinear'),
#     LogisticRegression(random_state=42, max_iter=10000, C=120, penalty='l1', solver='liblinear'),
# ]
# clf_columns = ['Accuracy', 'F1 Score', 'Precision', 'Recall']
# clf_compare = pd.DataFrame(columns=clf_columns)  # Define the columns for clf_compare DataFrame

# # Train the models with different solvers and then output the roc_curve, accuracy, f1_score, precision, recall, and confusion matrix
# for i in clf:
#     i.fit(X, y)
#     name = i.__class__.__name__
    
#     print("="*30)
#     print(name)
    
#     print('****Results****')
#     y_pred = i.predict(vectors_test)
#     acc = accuracy_score(df_test['label'], y_pred)
#     f1 = f1_score(df_test['label'], y_pred, average='weighted')
#     prec = precision_score(df_test['label'], y_pred, average='weighted')
#     recall = recall_score(df_test['label'], y_pred, average='weighted')
#     print("Accuracy: {:.4%}".format(acc))
#     print("F1 Score: {:.4}".format(f1))
#     print("Precision: {:.4}".format(prec))
#     print("Recall: {:.4}".format(recall))
#     print("Confusion Matrix:")
#     print(confusion_matrix(df_test['label'], y_pred))
    
#     fpr, tpr, thresholds = roc_curve(df_test['label'], y_pred)
#     clf_compare.loc[name] = [acc, f1, prec, recall]
    
#     print("="*30)



In [None]:
# # Evaluating different hyperparameters for Random Forest Classifier
# # Archive
#     # RandomForestClassifier(random_state=42, n_estimators=20, max_depth=40, criterion='gini'),
#     # RandomForestClassifier(random_state=42, n_estimators=20, max_depth=40, criterion='log_loss'),
#     # RandomForestClassifier(random_state=42, n_estimators=20, max_depth=40, criterion='entropy', min_samples_split=2),
#     # RandomForestClassifier(random_state=42, n_estimators=20, max_depth=40, criterion='log_loss', min_samples_split=2)
# clf_rf = [
#     RandomForestClassifier(random_state=42, n_estimators=20, max_depth=40, criterion='entropy'), #
#     RandomForestClassifier(random_state=42, n_estimators=20, max_depth=40, criterion='entropy', min_samples_split=2), #
#     RandomForestClassifier(random_state=42, n_estimators=20, max_depth=40, criterion='entropy', min_samples_split=4), #
#     RandomForestClassifier(random_state=42, n_estimators=20, max_depth=40, criterion='entropy', min_samples_split=6), #
#     RandomForestClassifier(random_state=42, n_estimators=20, max_depth=40, criterion='entropy', min_samples_split=10), #
#     RandomForestClassifier(random_state=42, n_estimators=20, max_depth=40, criterion='gini'), # 
#     RandomForestClassifier(random_state=42, n_estimators=20, max_depth=40, criterion='gini', min_samples_split=2), # 
#     RandomForestClassifier(random_state=42, n_estimators=20, max_depth=40, criterion='gini', min_samples_split=4), # 
#     RandomForestClassifier(random_state=42, n_estimators=20, max_depth=40, criterion='gini', min_samples_split=6), # 
#     RandomForestClassifier(random_state=42, n_estimators=20, max_depth=40, criterion='gini', min_samples_split=10), # 


# ]
# clf_rf_columns = ['Accuracy', 'F1 Score', 'Precision', 'Recall']
# clf_rf_compare = pd.DataFrame(columns=clf_rf_columns)  # Define the columns for clf_compare DataFrame

# # Train the models with different solvers and then output the roc_curve, accuracy, f1_score, precision, recall, and confusion matrix
# for i in clf_rf:
#     print(f'Fitting {i}')
#     i.fit(X, y)
#     name = i.__class__.__name__
    
#     print("="*30)
#     print(name)
    
#     print('****Results****')
#     y_pred = i.predict(vectors_test)
#     acc = accuracy_score(df_test['label'], y_pred)
#     f1 = f1_score(df_test['label'], y_pred, average='weighted')
#     prec = precision_score(df_test['label'], y_pred, average='weighted')
#     recall = recall_score(df_test['label'], y_pred, average='weighted')
#     print("Accuracy: {:.4%}".format(acc))
#     print("F1 Score: {:.4}".format(f1))
#     print("Precision: {:.4}".format(prec))
#     print("Recall: {:.4}".format(recall))
#     print("Confusion Matrix:")
#     print(confusion_matrix(df_test['label'], y_pred))
    
#     fpr, tpr, thresholds = roc_curve(df_test['label'], y_pred)
#     clf_rf_compare.loc[name] = [acc, f1, prec, recall]
    
#     print("="*30)

In [None]:
# # Evaluating different hyperparameters for Gradient Boosting Classifier

# clf_gb = [
#     GradientBoostingClassifier(random_state=42, n_estimators=100, max_depth=10),
#     GradientBoostingClassifier(random_state=42, n_estimators=100, max_depth=20),
#     GradientBoostingClassifier(random_state=42, n_estimators=100, max_depth=30),
#     GradientBoostingClassifier(random_state=42, n_estimators=100, max_depth=40),
#     # Examining Learning Rate
#     GradientBoostingClassifier(random_state=42, n_estimators=100, max_depth=10, learning_rate=0.1),
#     GradientBoostingClassifier(random_state=42, n_estimators=100, max_depth=10, learning_rate=0.2),
#     GradientBoostingClassifier(random_state=42, n_estimators=100, max_depth=10, learning_rate=0.3),
#     GradientBoostingClassifier(random_state=42, n_estimators=100, max_depth=10, learning_rate=0.4),
#     # Examining Min Samples Split
#     GradientBoostingClassifier(random_state=42, n_estimators=100, max_depth=10, min_samples_split=2),
#     GradientBoostingClassifier(random_state=42, n_estimators=100, max_depth=10, min_samples_split=4),
#     GradientBoostingClassifier(random_state=42, n_estimators=100, max_depth=10, min_samples_split=6),
#     GradientBoostingClassifier(random_state=42, n_estimators=100, max_depth=10, min_samples_split=10),
#     # Examining subsample
#     GradientBoostingClassifier(random_state=42, n_estimators=100, max_depth=10, subsample=0.8),
#     GradientBoostingClassifier(random_state=42, n_estimators=100, max_depth=10, subsample=0.9),
#     GradientBoostingClassifier(random_state=42, n_estimators=100, max_depth=10, subsample=1.0),

# ]

# clf_gb_columns = ['Accuracy', 'F1 Score', 'Precision', 'Recall']
# clf_gb_compare = pd.DataFrame(columns=clf_gb_columns)  # Define the columns for clf_compare DataFrame

# # Train the models with different solvers and then output the roc_curve, accuracy, f1_score, precision, recall, and confusion matrix

# for i in clf_gb:
#     print(f'Fitting {i}')
#     i.fit(X, y)
#     name = i.__class__.__name__
    
#     print("="*30)
#     print(name)
    
#     print('****Results****')
#     y_pred = i.predict(vectors_test)
#     acc = accuracy_score(df_test['label'], y_pred)
#     f1 = f1_score(df_test['label'], y_pred, average='weighted')
#     prec = precision_score(df_test['label'], y_pred, average='weighted')
#     recall = recall_score(df_test['label'], y_pred, average='weighted')
#     print("Accuracy: {:.4%}".format(acc))
#     print("F1 Score: {:.4}".format(f1))
#     print("Precision: {:.4}".format(prec))
#     print("Recall: {:.4}".format(recall))
#     print("Confusion Matrix:")
#     print(confusion_matrix(df_test['label'], y_pred))
    
#     fpr, tpr, thresholds = roc_curve(df_test['label'], y_pred)
#     clf_gb_compare.loc[name] = [acc, f1, prec, recall]
    
#     print("="*30)

Fitting GradientBoostingClassifier(max_depth=10, random_state=42)
GradientBoostingClassifier
****Results****
Accuracy: 67.0000%
F1 Score: 0.6312
Precision: 0.6119
Recall: 0.67
Confusion Matrix:
[[63 10]
 [23  4]]
Fitting GradientBoostingClassifier(max_depth=20, random_state=42)
GradientBoostingClassifier
****Results****
Accuracy: 63.0000%
F1 Score: 0.6359
Precision: 0.6427
Recall: 0.63
Confusion Matrix:
[[53 20]
 [17 10]]
Fitting GradientBoostingClassifier(max_depth=30, random_state=42)
GradientBoostingClassifier
****Results****
Accuracy: 64.0000%
F1 Score: 0.6439
Precision: 0.6483
Recall: 0.64
Confusion Matrix:
[[54 19]
 [17 10]]
Fitting GradientBoostingClassifier(max_depth=40, random_state=42)
GradientBoostingClassifier
****Results****
Accuracy: 59.0000%
F1 Score: 0.5965
Precision: 0.6039
Recall: 0.59
Confusion Matrix:
[[51 22]
 [19  8]]
Fitting GradientBoostingClassifier(max_depth=10, random_state=42)
GradientBoostingClassifier
****Results****
Accuracy: 67.0000%
F1 Score: 0.6312
Pre