# Setup

In [1]:
# Load pandas and numpy
import pandas as pd
import numpy as np

# For text preprocessing
import spacy
nlp = spacy.load('en_core_web_sm')

# For text vectorization we will use Doc2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# For the classifier we will use Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# For evaluation we will use accuracy, F1-score, precision, recall and the confusion matrix
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix




In [2]:
# Locations of the training split and the development (validation) split
filepath_train = '/Users/thebekhruz/Desktop/nlu/EvidenceExplorer/data/train/train.csv'
filepath_test = '/Users/thebekhruz/Desktop/nlu/EvidenceExplorer/data/validate/dev.csv'

# Read both CSV files into DataFrames, training split first
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (filepath_train, filepath_test))

# Preview the first 5 rows of the training dataset (rendered as the cell output)
df_train.head()


Unnamed: 0,Claim,Evidence,label
0,We should legalize the growing of coca leaf,"Robert W. Sweet, a federal judge, strongly agr...",1
1,We should ban trans fats usage in food,The net increase in LDL/HDL ratio with trans f...,1
2,We should legalize prostitution,"Pertaining to health, safety and services, the...",0
3,We should subsidize investigative journalism,"Date granted: 10 June 2002 Citation: ""For serv...",0
4,We should abolish homework,The Yarrabah community has a public library wh...,0


# Preprocessing

In [3]:
# Function to create tagged documents
# This is required for Doc2Vec to train the model

# We will use spaCy to tokenize and preprocess the text and then create tagged documents
# Each document is tagged with the index of the row in the dataframe


def create_tagged_document(df):
    """Turn each row of ``df`` into a gensim ``TaggedDocument`` for Doc2Vec.

    The ``Claim`` and ``Evidence`` columns are joined with a single space,
    run through the module-level spaCy pipeline ``nlp``, and reduced to
    lower-cased lemmas with stop words and punctuation removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'Claim'`` and ``'Evidence'``. Missing
        values in either column are treated as empty text.

    Returns
    -------
    list of TaggedDocument
        One document per row, in row order. Each document is tagged with its
        positional row number (``'0'``, ``'1'``, ...) as a string, not with
        the DataFrame's index label.
    """
    # A missing Claim/Evidence would turn the concatenation into NaN (a float),
    # which spaCy cannot process; substitute empty strings instead.
    texts = df['Claim'].fillna('').astype(str) + ' ' + df['Evidence'].fillna('').astype(str)

    tagged_data = []
    # nlp.pipe streams the texts through the pipeline in batches and yields
    # the resulting Docs in input order, avoiding one nlp() call per row.
    for i, doc in enumerate(nlp.pipe(texts)):
        # Tokenize and lemmatize the text, removing stopwords and punctuation
        tokens = [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct]
        # Tags are typically strings, so the row position is stringified
        tagged_data.append(TaggedDocument(words=tokens, tags=[str(i)]))
    return tagged_data


# Build the Doc2Vec training corpus from the training DataFrame
tagged_data_train = create_tagged_document(df_train)
# Show the first five tagged documents as a quick sanity check of the preprocessing
tagged_data_train[:5]



[TaggedDocument(words=['legalize', 'grow', 'coca', 'leaf', 'robert', 'w.', 'sweet', 'federal', 'judge', 'strongly', 'agree', 'present', 'policy', 'try', 'prohibit', 'use', 'drug', 'use', 'criminal', 'law', 'mistake', 'ref'], tags=['0']),
 TaggedDocument(words=['ban', 'trans', 'fat', 'usage', 'food', 'net', 'increase', 'ldl', 'hdl', 'ratio', 'trans', 'fat', 'approximately', 'double', 'saturate', 'fat', 'ref'], tags=['1']),
 TaggedDocument(words=['legalize', 'prostitution', 'pertain', 'health', 'safety', 'service', 'report', 'state', 'program', 'include', 'occupational', 'educational', 'program', 'health', 'program', 'continue', 'work', 'prostitute', 'wish', 'transition', 'occupation', 'ref'], tags=['2']),
 TaggedDocument(words=['subsidize', 'investigative', 'journalism', 'date', 'grant', '10', 'june', '2002', 'citation', 'service', 'community', 'investigative', 'journalism', 'western', 'australia', '"[ref'], tags=['3']),
 TaggedDocument(words=['abolish', 'homework', 'yarrabah', 'communi

# Training Doc2Vec Model

In [4]:
# Doc2Vec hyper-parameters: 100-dimensional vectors, a context window of 2,
# every word kept (min_count=1), 4 worker threads and 20 training epochs.
# NOTE(review): gensim documents that multi-threaded training (workers > 1) is
# not fully deterministic - confirm whether exact reproducibility of the
# reported metrics is required.
doc2vec_params = dict(vector_size=100, window=2, min_count=1, workers=4, epochs=20)
model = Doc2Vec(**doc2vec_params)

# The vocabulary has to be built from the tagged corpus before training
model.build_vocab(tagged_data_train)

# Train on the same corpus, using the example count and epoch count stored on the model
model.train(tagged_data_train, total_examples=model.corpus_count, epochs=model.epochs)



## Generate Embeddings

In [5]:
# Infer an embedding for every training document with the trained model and
# stack the results into a single numpy array (one row per document)
vectors_train = np.array(
    [model.infer_vector(tagged_doc.words) for tagged_doc in tagged_data_train]
)
vectors_train.shape


(23702, 100)

In [6]:
# Extract the features (document embeddings) and the target labels
X, y = vectors_train, df_train['label']

# Fit a Logistic Regression classifier on the training embeddings
clf = LogisticRegression(max_iter=1000, random_state=0)
clf.fit(X, y)


# Making Predictions and Evaluating the Model


In [7]:
# Embed the test documents using the same preprocessing and Doc2Vec model
tagged_data_test = create_tagged_document(df_test)
vectors_test = np.array(
    [model.infer_vector(tagged_doc.words) for tagged_doc in tagged_data_test]
)

# Predict the labels for the test data
y_pred = clf.predict(vectors_test)

# Score the predictions against the true labels with each evaluation metric
y_true = df_test['label']
accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
conf_matrix = confusion_matrix(y_true, y_pred)

# Report the metrics
print(f'Accuracy: {accuracy}')
print(f'F1 Score: {f1}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'Confusion Matrix:\n{conf_matrix}')


Accuracy: 0.7323658454269322
F1 Score: 0.291331546023235
Precision: 0.5101721439749609
Recall: 0.2038774233896185
Confusion Matrix:
[[4014  313]
 [1273  326]]


In [8]:
# Attach the predicted labels to the test DataFrame as a new column
df_test['predicted_label'] = y_pred

# Write the test set, now including the predictions, to CSV without the DataFrame index
predictions_path = '/Users/thebekhruz/Desktop/nlu/EvidenceExplorer/data/validate/dev_predicted.csv'
df_test.to_csv(predictions_path, index=False)
