#### Import

In [19]:
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
import sklearn
import nltk
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, ParameterGrid
from sklearn.metrics import classification_report

In [22]:
# Column names applied to the LIAR table, in file order.
columns = [
    # identifiers and target
    "id", "label",
    # statement text and its topic(s)
    "statement", "subject",
    # speaker metadata
    "speaker", "job_title", "state_info", "party_affiliation",
    # speaker's historical counts per truthfulness rating
    "barely_true_counts", "false_counts", "half_true_counts",
    "mostly_true_counts", "pants_on_fire_counts",
    # where the statement was made, and the accompanying justification
    "context", "justification",
]

# The first CSV column is used as the row index.
# (On Colab the file was read from "/content/sample_data/LIAR.csv" instead.)
df = pd.read_csv("LIAR.csv", index_col=0).set_axis(columns, axis=1)

#### Deal with Missing Values

In [23]:
# Columns in which a missing value makes the whole row unusable.
columns_with_missing = ['subject', 'party_affiliation', 'barely_true_counts', 'false_counts', 'half_true_counts',
                        'mostly_true_counts', 'pants_on_fire_counts', 'context', 'justification']

# Drop rows with missing values in the specified columns.
# `.copy()` gives an independent frame so the assignments below write to `df` itself.
df = df.dropna(subset=columns_with_missing).copy()

# Seeded generator so the random imputation is reproducible across runs.
imputation_rng = np.random.default_rng(0)

# Fill missing 'job_title' / 'state_info' entries with values drawn at random
# from the observed values of the same column (preserves the empirical distribution).
# The draws are written through a boolean mask, i.e. positionally, so the
# non-contiguous row labels left behind by dropna cannot misalign the fill.
for column_to_fill in ('job_title', 'state_info'):
    missing_mask = df[column_to_fill].isna()
    if missing_mask.any():
        observed_values = df[column_to_fill].dropna().to_numpy()
        df.loc[missing_mask, column_to_fill] = imputation_rng.choice(
            observed_values, size=int(missing_mask.sum())
        )

df.state_info.describe()

count     12565
unique       83
top       Texas
freq       1568
Name: state_info, dtype: object

In [31]:
# Summary of job_title after imputation; `count` only includes non-missing entries.
df.job_title.describe()

count         12565
unique         1343
top       President
freq            822
Name: job_title, dtype: object

#### Map to binary outcome

In [32]:
def mapping_label(s):
    """Map a six-way truthfulness label to a binary integer target.

    "mostly-true" and "true" map to 1; "pants-fire", "false", "barely-true"
    and "half-true" map to 0. Matching ignores case and surrounding whitespace.

    Parameters
    ----------
    s : object
        The raw label. Anything that is not a string (e.g. NaN or None from a
        missing cell) is treated as unknown.

    Returns
    -------
    int
        1, 0, or -1 when the label is unknown or not a string.
    """
    label_mapping = {"pants-fire": 0, "false": 0, "barely-true": 0, "half-true": 0, "mostly-true": 1, "true": 1}
    # Missing labels arrive as float NaN / None, which have no .lower().
    if not isinstance(s, str):
        return -1
    return label_mapping.get(s.strip().lower(), -1)

# Binary target derived from the original label column.
df['integer_label'] = df.label.apply(mapping_label)

# 'subject' holds a comma-separated list; keep the first four entries as
# separate columns. `reindex` pads with empty columns if the data yields fewer
# than four, so the renaming below cannot fail on a shape mismatch.
subject_columns = ['subject1', 'subject2', 'subject3', 'subject4']
all_subjects = (
    df.subject.str.split(',', expand=True)
    .reindex(columns=range(len(subject_columns)))
    .set_axis(subject_columns, axis=1)
    .reset_index(drop=True)
)

# Both frames get a fresh 0..n-1 index before the column-wise concat so rows
# pair up one-to-one (the row count must not grow); drop=True avoids creating
# helper 'index' columns that would need removing afterwards.
df = pd.concat([df.reset_index(drop=True), all_subjects], axis=1)
df = df.drop(columns=['label', 'subject'])

#### Encoding Categorical Variables

In [49]:
def top_n_encoder(series, n, prefix=None):
    """One-hot encode the ``n`` most frequent categories of a series.

    Every entry that is not among the ``n`` most frequent values is replaced
    by the literal ``"other"`` before encoding. The input series is left
    unmodified.

    Parameters
    ----------
    series : pd.Series
        Categorical values to encode.
    n : int
        Number of most frequent categories that keep their own dummy column.
    prefix : str, optional
        Prefix passed through to ``pd.get_dummies`` for the column names.

    Returns
    -------
    pd.DataFrame
        Dummy columns for the kept categories plus "other" (when present).
    """
    frequent_values = series.value_counts().head(n).index
    collapsed = series.where(series.isin(frequent_values), "other")
    return pd.get_dummies(collapsed, prefix=prefix)

# One-hot encode each categorical column, keeping only its most frequent
# (lower-cased) values; everything else is bucketed as "other".
encoded_job_title = top_n_encoder(df.job_title.str.lower(), n=100, prefix="job")
encoded_party = top_n_encoder(df.party_affiliation.str.lower(), n=100, prefix="party")
encoded_speaker = top_n_encoder(df.speaker.str.lower(), n=100, prefix="speaker")
encoded_state_info = top_n_encoder(df.state_info.str.lower(), n=100, prefix="state_info")
encoded_context = top_n_encoder(df.context.str.lower(), n=150, prefix="context")
encoded_subject1 = top_n_encoder(df.subject1.str.lower(), n=150, prefix="subject1")
encoded_subject2 = top_n_encoder(df.subject2.str.lower(), n=150, prefix="subject2")
encoded_subject3 = top_n_encoder(df.subject3.str.lower(), n=150, prefix="subject3")
encoded_subject4 = top_n_encoder(df.subject4.str.lower(), n=150, prefix="subject4")

# encoded_state_info must be part of the concat: the raw 'state_info' column
# is dropped below, so leaving it out would discard the feature entirely.
categorical_features = pd.concat([encoded_job_title, encoded_party, encoded_speaker, encoded_state_info,
                                  encoded_context, encoded_subject1, encoded_subject2, encoded_subject3,
                                  encoded_subject4], axis=1)

# Attach the encoded columns row-by-row; both sides are re-indexed 0..n-1 and
# drop=True avoids creating helper 'index' columns.
df = pd.concat([df.reset_index(drop=True), categorical_features.reset_index(drop=True)], axis=1)

# Drop the original (un-encoded) categorical columns.
df = df.drop(columns=['job_title', 'party_affiliation', 'speaker', 'state_info', 'context',
                      'subject1', 'subject2', 'subject3', 'subject4'])

#### Cleaning the statements

In [50]:
# Raw strings keep the regex escapes (\[ \] \|) out of Python's own string
# escape processing, which otherwise warns about invalid escape sequences.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')  # punctuation replaced by a space
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')  # every character outside this set is deleted
# English stop words from NLTK (requires the 'stopwords' corpus to be available).
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """Normalise one statement for vectorisation.

    Steps, in order: extract the text via BeautifulSoup's lxml parser (HTML
    decoding), lowercase it, replace the characters matched by
    REPLACE_BY_SPACE_RE with a space, delete the characters matched by
    BAD_SYMBOLS_RE, and drop every whitespace-separated word found in
    STOPWORDS.

    Returns the remaining words joined by single spaces.
    """
    decoded = BeautifulSoup(text, "lxml").text
    spaced = REPLACE_BY_SPACE_RE.sub(' ', decoded.lower())
    stripped = BAD_SYMBOLS_RE.sub('', spaced)
    kept_words = [word for word in stripped.split() if word not in STOPWORDS]
    return ' '.join(kept_words)

# Append the cleaned text as 'statement_clean' and discard the raw 'statement'.
df = (
    df.assign(statement_clean=df.statement.apply(clean_text))
    .drop(columns=['statement'])
)

#### split data

In [51]:
from sklearn.model_selection import train_test_split

# Target is the binary label; features are everything except the label and the row id.
y = df['integer_label']
X = df.drop(columns=['integer_label', 'id'])

# First hold out 20% as the test set, then carve 25% of the remainder off as
# validation, i.e. a 60/20/20 train/val/test split with fixed seeds.
X_dev, X_test, y_dev, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)
X_train, X_val, y_train, y_val = train_test_split(
    X_dev, y_dev, test_size=0.25, random_state=0
)

#### tokenize the statements

In [52]:
# Step 1: token counts. The vocabulary is learned from the training split only
# and then reused unchanged for the validation and test splits.
vectorizer = CountVectorizer()
train_counts = vectorizer.fit_transform(X_train.statement_clean)
val_counts = vectorizer.transform(X_val.statement_clean)
test_counts = vectorizer.transform(X_test.statement_clean)

# Step 2: re-weight the counts into a normalized tf-idf representation, again
# fitting on the training split only.
tfidf = TfidfTransformer()
X_text_train = tfidf.fit_transform(train_counts)
X_text_val = tfidf.transform(val_counts)
X_text_test = tfidf.transform(test_counts)

In X, the original 'statement' is cleaned and stored in the column 'statement_clean'; <br>
all other columns are numerical, including those encoded from categorical variables.<br>
The 'statement_clean' column is further vectorized (token counts, then tf-idf) and split into X_text_train, X_text_val, X_text_test.

#### Store Variables

In [53]:
# Persist the splits with IPython's %store magic; another notebook can reload
# each one with `%store -r <variable_name>`.
%store X_text_train
%store X_text_val
%store X_text_test
%store X_train
%store X_val
%store X_test
%store y_train
%store y_val
%store y_test

Stored 'X_text_train' (csr_matrix)
Stored 'X_text_val' (csr_matrix)
Stored 'X_text_test' (csr_matrix)
Stored 'X_train' (DataFrame)
Stored 'X_val' (DataFrame)
Stored 'X_test' (DataFrame)
Stored 'y_train' (Series)
Stored 'y_val' (Series)
Stored 'y_test' (Series)


### Baseline Model: Logistic Regression

In [60]:
# Baseline: logistic regression with default settings on the tf-idf features.
logr = LogisticRegression()
logr.fit(X_text_train, y_train)

# Class predictions for each split (the test predictions are reused in the
# classification report further down).
logr_pred_train, logr_pred_val, logr_pred_test = (
    logr.predict(split_features)
    for split_features in (X_text_train, X_text_val, X_text_test)
)

# Mean accuracy per split.
for score_label, split_features, split_target in (
    ("train score:", X_text_train, y_train),
    ("val score:", X_text_val, y_val),
    ("test score:", X_text_test, y_test),
):
    print(score_label, logr.score(split_features, split_target))

train score: 0.8104265402843602
val score: 0.6022906793048973
test score: 0.6078199052132701


### Baseline Model: Support Vector Machine (SVM)

In [59]:
# Baseline: support vector classifier with default settings on the tf-idf features.
clf = svm.SVC()
clf.fit(X_text_train, y_train)

# Class predictions for each split (the test predictions are reused in the
# classification report further down).
pred_train, pred_val, pred_test = (
    clf.predict(split_features)
    for split_features in (X_text_train, X_text_val, X_text_test)
)

# Mean accuracy per split.
for score_label, split_features, split_target in (
    ("train score:", X_text_train, y_train),
    ("val score:", X_text_val, y_val),
    ("test score:", X_text_test, y_test),
):
    print(score_label, clf.score(split_features, split_target))

train score: 0.9636650868878357
val score: 0.6113744075829384
test score: 0.6216429699842022


In [61]:
# Baseline: random forest on the tf-idf features, seeded for reproducibility.
rf = RandomForestClassifier(random_state=84).fit(X_text_train, y_train)

# Mean accuracy on the held-out test split.
rf_test_accuracy = rf.score(X_text_test, y_test)
print("test performance:", rf_test_accuracy)

test performance: 0.6097946287519748


In [63]:

# Regularisation strengths shared by both searches.
c_grid = np.logspace(-3, 3, 20)

# Logistic regression only exposes `C` here; 'linear'/'rbf' are SVC kernels,
# so they belong to the SVM search below.
# NOTE: logr_cv is only constructed in this cell, not fitted.
logr_cv = GridSearchCV(logr, {'C': c_grid})

# Grid search over kernel and C for the support vector classifier.
svm_cv = GridSearchCV(clf, {'kernel': ('linear', 'rbf'), 'C': c_grid})
svm_cv.fit(X_text_train, y_train)

# Per-class precision/recall/F1 on the test split for the two *baseline*
# models (logistic regression first, then SVM), using the predictions
# computed when those baselines were fitted.
print(classification_report(y_test, logr_pred_test))
print(classification_report(y_test, pred_test))

              precision    recall  f1-score   support

           0       0.57      0.42      0.48      1113
           1       0.62      0.76      0.68      1419

    accuracy                           0.61      2532
   macro avg       0.60      0.59      0.58      2532
weighted avg       0.60      0.61      0.60      2532

              precision    recall  f1-score   support

           0       0.61      0.38      0.47      1113
           1       0.62      0.81      0.71      1419

    accuracy                           0.62      2532
   macro avg       0.62      0.60      0.59      2532
weighted avg       0.62      0.62      0.60      2532

