In [5]:
# Standard library
import os

# Load pandas and numpy
import numpy as np
import pandas as pd
# For text preprocessing
import spacy

# Only lemmas and stop-word flags are read from the processed documents in this
# notebook, so the dependency parser and the named-entity recognizer are disabled
# to avoid running them on every text. Drop `disable` if parses or entities are needed.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [6]:
# Load the dataset with pandas and display the first 5 rows.
# The CSV location can be overridden with the EVIDENCE_EXPLORER_DEV_CSV environment
# variable so the notebook also runs on machines where the default path does not exist.
filepath = os.environ.get(
    'EVIDENCE_EXPLORER_DEV_CSV',
    '/Users/thebekhruz/Desktop/nlu/EvidenceExplorer/data/validate/dev.csv',
)
df = pd.read_csv(filepath)
df.head()


Unnamed: 0,Claim,Evidence,label
0,We should legalize the growing of coca leaf,"Seeing the involvement of the coca growers, th...",0
1,We should limit the use of birth control,"Although FDA-approved for contraceptive use, S...",0
2,We should prohibit flag burning,The case of Texas v. Johnson was appealed to t...,0
3,The vow of celibacy should be abandoned,Much of the encyclical is spent discussing rea...,1
4,We should further exploit natural gas,Helium is typically produced by separating it ...,0


### Check the dataset 

In [7]:
# Count the missing (NaN) values in each column
df.isna().sum()


Claim       0
Evidence    0
label       0
dtype: int64

In [8]:
# Count the rows that exactly repeat an earlier row (first occurrences are not counted)
df.duplicated(keep='first').sum()
    

0

In [9]:
# Check the shape of the dataset, reported as a (rows, columns) tuple
df.shape

(5926, 3)

### Preprocessing 

In [10]:
# Clean the text using spaCy.
# This involves tokenization, stopword removal, lemmatization, and lowercasing.
def _lemmas_without_stopwords(doc):
    """Join the lower-cased lemmas of every non-stopword token in `doc` with spaces."""
    return ' '.join(token.lemma_.lower() for token in doc if not token.is_stop)


def clean_text(text):
    """Clean a single string with the module-level spaCy pipeline `nlp`.

    Parameters
    ----------
    text : str
        Raw text to process.

    Returns
    -------
    str
        The lower-cased lemmas of all tokens that are not stopwords, separated
        by single spaces. Only stopwords are filtered out, so punctuation
        tokens are kept in the result.
    """
    return _lemmas_without_stopwords(nlp(text))


def clean_texts(texts):
    """Clean many strings at once and return the results as a list in input order.

    Produces the same string per item as `clean_text`, but streams the texts
    through `nlp.pipe` so spaCy can process them in batches instead of being
    invoked once per row.
    """
    return [_lemmas_without_stopwords(doc) for doc in nlp.pipe(texts)]


# Apply the cleaning to the Claim and Evidence columns
df['Claim_Cleaned'] = clean_texts(df['Claim'])
df['Evidence_Cleaned'] = clean_texts(df['Evidence'])



In [12]:
# Put each cleaned column right after its raw counterpart and keep the label last
column_order = ['Claim', 'Claim_Cleaned', 'Evidence', 'Evidence_Cleaned', 'label']
df = df[column_order]
df.head()

Unnamed: 0,Claim,Claim_Cleaned,Evidence,Evidence_Cleaned,label
0,We should legalize the growing of coca leaf,legalize growing coca leaf,"Seeing the involvement of the coca growers, th...","see involvement coca grower , bolivian governm...",0
1,We should limit the use of birth control,limit use birth control,"Although FDA-approved for contraceptive use, S...","fda - approve contraceptive use , searle marke...",0
2,We should prohibit flag burning,prohibit flag burning,The case of Texas v. Johnson was appealed to t...,case texas v. johnson appeal united states sup...,0
3,The vow of celibacy should be abandoned,vow celibacy abandon,Much of the encyclical is spent discussing rea...,encyclical spend discuss reason paul vi believ...,1
4,We should further exploit natural gas,exploit natural gas,Helium is typically produced by separating it ...,helium typically produce separate natural gas ...,0


In [13]:
# Rename the 'label' column to 'Label'
df = df.rename(columns={'label': 'Label'})
df.head()

Unnamed: 0,Claim,Claim_Cleaned,Evidence,Evidence_Cleaned,Label
0,We should legalize the growing of coca leaf,legalize growing coca leaf,"Seeing the involvement of the coca growers, th...","see involvement coca grower , bolivian governm...",0
1,We should limit the use of birth control,limit use birth control,"Although FDA-approved for contraceptive use, S...","fda - approve contraceptive use , searle marke...",0
2,We should prohibit flag burning,prohibit flag burning,The case of Texas v. Johnson was appealed to t...,case texas v. johnson appeal united states sup...,0
3,The vow of celibacy should be abandoned,vow celibacy abandon,Much of the encyclical is spent discussing rea...,encyclical spend discuss reason paul vi believ...,1
4,We should further exploit natural gas,exploit natural gas,Helium is typically produced by separating it ...,helium typically produce separate natural gas ...,0


## Building an evidence detection classifier with TF-IDF and an SVM

### Training the TF-IDF + SVM model

In [14]:
# Import necessary libraries for training the model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix



In [15]:
# Build one text feature per row: the cleaned claim followed by the cleaned evidence
claim_text = df['Claim_Cleaned']
evidence_text = df['Evidence_Cleaned']
X = claim_text + ' ' + evidence_text
y = df['Label']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


## Create and Train the Model


In [17]:
# Initialize the TF-IDF vectorizer to convert text to feature vectors
tfidf = TfidfVectorizer()

# Fit the vectorizer on the training text only and transform it to a TF-IDF matrix
X_train_tfidf = tfidf.fit_transform(X_train)

# Transform the test text with the already-fitted vectorizer (no refitting on test data)
X_test_tfidf = tfidf.transform(X_test)

# Initialize the Support Vector Classifier (SVC) with its default hyperparameters
model = SVC()

# Train the SVC model using the TF-IDF features from the training set and the training labels
model.fit(X_train_tfidf, y_train)

# Make predictions on the held-out test set
y_pred = model.predict(X_test_tfidf)

# Evaluate the model.
# The classification report and the confusion matrix are multi-line, so they are
# started on their own line; printing them right after the label shifts the first
# row and breaks the column alignment.
print('Accuracy:', accuracy_score(y_test, y_pred))
print(f'Classification Report:\n{classification_report(y_test, y_pred)}')
print(f'Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}')



Accuracy: 0.7731871838111298
Classification Report:               precision    recall  f1-score   support

           0       0.78      0.96      0.86       846
           1       0.76      0.31      0.44       340

    accuracy                           0.77      1186
   macro avg       0.77      0.63      0.65      1186
weighted avg       0.77      0.77      0.74      1186

Confusion Matrix: [[812  34]
 [235 105]]


In [None]:
# 