# Setup

In [1]:
# Load pandas and numpy
import pandas as pd
import numpy as np

# For text preprocessing
import spacy
nlp = spacy.load('en_core_web_sm')

# Data Loading and Exploration

In [2]:
# Load the dataset with pandas and display the first 5 rows
filepath = '/Users/thebekhruz/Desktop/nlu/EvidenceExplorer/data/validate/dev.csv'
df = pd.read_csv(filepath)
df.head()


Unnamed: 0,Claim,Evidence,label
0,We should legalize the growing of coca leaf,"Seeing the involvement of the coca growers, th...",0
1,We should limit the use of birth control,"Although FDA-approved for contraceptive use, S...",0
2,We should prohibit flag burning,The case of Texas v. Johnson was appealed to t...,0
3,The vow of celibacy should be abandoned,Much of the encyclical is spent discussing rea...,1
4,We should further exploit natural gas,Helium is typically produced by separating it ...,0


## Check the dataset 

In [3]:
# Check for nan values 
df.isnull().sum()


Claim       0
Evidence    0
label       0
dtype: int64

In [4]:
# Check for duplicates
df.duplicated().sum()
    

0

In [5]:
# Check the shape of the dataset
df.shape

(5926, 3)

# Preprocessing 

In [6]:
# Clean the text using spacy
# This involves stopword removal, lemmatization, and lowercasing and tokensization
def clean_text(text):
    doc = nlp(text)
    tokens = [token.lemma_.lower() for token in doc if not token.is_stop]
    return ' '.join(tokens)

# Apply the clean_text function to the text column
df['Claim_Cleaned'] = df['Claim'].apply(clean_text)
df['Evidence_Cleaned'] = df['Evidence'].apply(clean_text)



In [None]:
# Change the order of the columns
df = df[['Claim', 'Claim_Cleaned', 'Evidence', 'Evidence_Cleaned', 'label']]

# Rename the Label column to 'Label'
df.rename(columns={'label':'Label'}, inplace=True)

df.head()

Unnamed: 0,Claim,Claim_Cleaned,Evidence,Evidence_Cleaned,label
0,We should legalize the growing of coca leaf,legalize growing coca leaf,"Seeing the involvement of the coca growers, th...","see involvement coca grower , bolivian governm...",0
1,We should limit the use of birth control,limit use birth control,"Although FDA-approved for contraceptive use, S...","fda - approve contraceptive use , searle marke...",0
2,We should prohibit flag burning,prohibit flag burning,The case of Texas v. Johnson was appealed to t...,case texas v. johnson appeal united states sup...,0
3,The vow of celibacy should be abandoned,vow celibacy abandon,Much of the encyclical is spent discussing rea...,encyclical spend discuss reason paul vi believ...,1
4,We should further exploit natural gas,exploit natural gas,Helium is typically produced by separating it ...,helium typically produce separate natural gas ...,0


In [None]:
# Delete Claim and Evidence columns as they will no longer be used.
del df['Claim']
del df['Evidence']

In [None]:
df.head()

Unnamed: 0,Claim_Cleaned,Evidence_Cleaned,Label
0,legalize growing coca leaf,"see involvement coca grower , bolivian governm...",0
1,limit use birth control,"fda - approve contraceptive use , searle marke...",0
2,prohibit flag burning,case texas v. johnson appeal united states sup...,0
3,vow celibacy abandon,encyclical spend discuss reason paul vi believ...,1
4,exploit natural gas,helium typically produce separate natural gas ...,0


# Model Preparation

#### Convert the data into numerical representaiton.
- We will use all-mpnet-base-v2 BERT model to generate document embeddings. 
- This model exhales in transforming sentences into vector forms. 

In [None]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("all-mpnet-base-v2")

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [None]:
# Convert the cleaned text to sentence embeddings
def get_embeddings(text):
    return model.encode(text)

# Apply the get_embeddings function to the cleaned text columns
df['Claim_Embeddings'] = df['Claim_Cleaned'].apply(get_embeddings)
df['Evidence_Embeddings'] = df['Evidence_Cleaned'].apply(get_embeddings)


In [None]:
# Reorder the columns
df = df[['Claim_Cleaned', 'Claim_Embeddings', 'Evidence_Cleaned', 'Evidence_Embeddings', 'Label']]
df.head()

Unnamed: 0,Claim_Cleaned,Claim_Embeddings,Evidence_Cleaned,Evidence_Embeddings,Label
0,legalize growing coca leaf,"[-0.028216107, 0.06747912, 0.005520506, 0.0384...","see involvement coca grower , bolivian governm...","[0.0087185195, 0.05022244, -0.010146592, 0.024...",0
1,limit use birth control,"[-0.017686747, 0.055877153, -0.021929173, -0.0...","fda - approve contraceptive use , searle marke...","[0.0067691784, 0.07828453, -0.013959794, -0.03...",0
2,prohibit flag burning,"[0.015522541, 0.063634604, 0.029161103, 0.0129...",case texas v. johnson appeal united states sup...,"[0.028628858, 0.011332012, 0.028129712, -0.006...",0
3,vow celibacy abandon,"[0.043576196, 0.09004921, 0.013401258, -0.0630...",encyclical spend discuss reason paul vi believ...,"[0.0021371646, -0.019889826, 0.024593318, 0.00...",1
4,exploit natural gas,"[0.00061509537, 0.042601164, 0.005643183, 0.02...",helium typically produce separate natural gas ...,"[0.04620535, -0.00413415, -0.023865238, 0.0318...",0


In [None]:
# Concatenate the Claim and Evidence embeddings to form a single embedding for each row in the dataset
df['Combined_Embeddings'] = df.apply(lambda x: np.concatenate([x['Claim_Embeddings'], x['Evidence_Embeddings']]), axis=1)
df.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Combined_Embeddings'] = df.apply(lambda x: np.concatenate([x['Claim_Embeddings'], x['Evidence_Embeddings']]), axis=1)


Unnamed: 0,Claim_Cleaned,Claim_Embeddings,Evidence_Cleaned,Evidence_Embeddings,Label,Combined_Embeddings
0,legalize growing coca leaf,"[-0.028216107, 0.06747912, 0.005520506, 0.0384...","see involvement coca grower , bolivian governm...","[0.0087185195, 0.05022244, -0.010146592, 0.024...",0,"[-0.028216107, 0.06747912, 0.005520506, 0.0384..."
1,limit use birth control,"[-0.017686747, 0.055877153, -0.021929173, -0.0...","fda - approve contraceptive use , searle marke...","[0.0067691784, 0.07828453, -0.013959794, -0.03...",0,"[-0.017686747, 0.055877153, -0.021929173, -0.0..."
2,prohibit flag burning,"[0.015522541, 0.063634604, 0.029161103, 0.0129...",case texas v. johnson appeal united states sup...,"[0.028628858, 0.011332012, 0.028129712, -0.006...",0,"[0.015522541, 0.063634604, 0.029161103, 0.0129..."
3,vow celibacy abandon,"[0.043576196, 0.09004921, 0.013401258, -0.0630...",encyclical spend discuss reason paul vi believ...,"[0.0021371646, -0.019889826, 0.024593318, 0.00...",1,"[0.043576196, 0.09004921, 0.013401258, -0.0630..."
4,exploit natural gas,"[0.00061509537, 0.042601164, 0.005643183, 0.02...",helium typically produce separate natural gas ...,"[0.04620535, -0.00413415, -0.023865238, 0.0318...",0,"[0.00061509537, 0.042601164, 0.005643183, 0.02..."


In [None]:
# Drop the Columns which are no longer needed: Claim_Cleaned, Claim_Embeddings, Evidence_Cleaned, Evidence_Embeddings

del df['Claim_Cleaned']
del df['Claim_Embeddings']
del df['Evidence_Cleaned']
del df['Evidence_Embeddings']
df.head()

In [None]:
# Extract X_train and y_train
X_train = np.stack(df['Combined_Embeddings'].to_numpy())
y_train = df['Label'].to_numpy()

# Save the X_train and y_train arrays
np.save('X_train.npy', X_train)
np.save('y_train.npy', y_train)


In [None]:
# Import the Logistic Regression model
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

# Load the X_train and y_train arrays
X_train = np.load('X_train.npy')
y_train = np.load('y_train.npy')

# Initialize the Logistic Regression classifier
clf = LogisticRegression(max_iter=1000)

# Train the classifier
clf.fit(X_train, y_train)

# Evaluate the classifier using cross-validation
scores = cross_val_score(clf, X_train, y_train, cv=5)
print('Cross-validation scores:', scores)
print('Mean cross-validation score:', scores.mean())


Cross-validation scores: [0.806914   0.81772152 0.8185654  0.82109705 0.80759494]
Mean cross-validation score: 0.814378579916181


# Results

In [None]:
# Load the validation dataset
filepath = '/Users/thebekhruz/Desktop/nlu/EvidenceExplorer/data/validate/dev.csv'
df_dev = pd.read_csv(filepath)
df_dev.head()


Unnamed: 0,Claim,Evidence,label
0,We should legalize the growing of coca leaf,"Seeing the involvement of the coca growers, th...",0
1,We should limit the use of birth control,"Although FDA-approved for contraceptive use, S...",0
2,We should prohibit flag burning,The case of Texas v. Johnson was appealed to t...,0
3,The vow of celibacy should be abandoned,Much of the encyclical is spent discussing rea...,1
4,We should further exploit natural gas,Helium is typically produced by separating it ...,0


In [None]:
# A function to prepare the data for the classifier

def prepare_data(df):
    # Clean the text
    df['Claim_Cleaned'] = df['Claim'].apply(clean_text)
    df['Evidence_Cleaned'] = df['Evidence'].apply(clean_text)
    
    # Get the embeddings
    df['Claim_Embeddings'] = df['Claim_Cleaned'].apply(get_embeddings)
    df['Evidence_Embeddings'] = df['Evidence_Cleaned'].apply(get_embeddings)
    
    # Concatenate the embeddings
    df['Combined_Embeddings'] = df.apply(lambda x: np.concatenate([x['Claim_Embeddings'], x['Evidence_Embeddings']]), axis=1)
    
    # Drop the columns
    del df['Claim']
    del df['Evidence']
    del df['Claim_Cleaned']
    del df['Claim_Embeddings']
    del df['Evidence_Cleaned']
    del df['Evidence_Embeddings']
    
    return df

# Prepare the validation dataset
df_dev = prepare_data(df_dev)
df_dev.head()

Unnamed: 0,label,Combined_Embeddings
0,0,"[-0.028216107, 0.06747912, 0.005520506, 0.0384..."
1,0,"[-0.017686747, 0.055877153, -0.021929173, -0.0..."
2,0,"[0.015522541, 0.063634604, 0.029161103, 0.0129..."
3,1,"[0.043576196, 0.09004921, 0.013401258, -0.0630..."
4,0,"[0.00061509537, 0.042601164, 0.005643183, 0.02..."


In [None]:
# Use the trained model to predict
predictions = clf.predict(np.stack(df_dev['Combined_Embeddings'].to_numpy()))
# Add the predictions to the dataframe
df_dev['Predictions'] = predictions

# Reorder the columns
df_dev = df_dev[['Combined_Embeddings', 'label', 'Predictions']]
df_dev.head()

Unnamed: 0,Combined_Embeddings,label,Predictions
0,"[-0.028216107, 0.06747912, 0.005520506, 0.0384...",0,0
1,"[-0.017686747, 0.055877153, -0.021929173, -0.0...",0,0
2,"[0.015522541, 0.063634604, 0.029161103, 0.0129...",0,1
3,"[0.043576196, 0.09004921, 0.013401258, -0.0630...",1,0
4,"[0.00061509537, 0.042601164, 0.005643183, 0.02...",0,0


In [None]:
# Calculate the accuracy
accuracy = accuracy_score(df_dev['label'], df_dev['Predictions'])
print('Accuracy:', accuracy)


Accuracy: 0.8391832602092474


In [None]:
# Save the model
import joblib
joblib.dump(clf, 'model.pkl')

# Save the validation dataset
df_dev.to_csv('dev_predictions.csv', index=False)
