In [32]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk  import pos_tag
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn, stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from scipy.sparse import hstack

In [2]:
np.random.seed(500)

In [3]:
train_corpus = pd.read_csv(r"./data/PUBHEALTH/train.tsv", sep='\t')
test_corpus = pd.read_csv(r"./data/PUBHEALTH/test.tsv", sep='\t')
dev_corpus = pd.read_csv(r"./data/PUBHEALTH/dev.tsv", sep='\t')

In [4]:
train_corpus.head()

Unnamed: 0,claim_id,claim,date_published,explanation,fact_checkers,main_text,sources,label,subjects
0,15661,"""The money the Clinton Foundation took from fr...","April 26, 2015","""Gingrich said the Clinton Foundation """"took m...",Katie Sanders,"""Hillary Clinton is in the political crosshair...",https://www.wsj.com/articles/clinton-foundatio...,false,"Foreign Policy, PunditFact, Newt Gingrich,"
1,9893,Annual Mammograms May Have More False-Positives,"October 18, 2011",This article reports on the results of a study...,,While the financial costs of screening mammogr...,,mixture,"Screening,WebMD,women's health"
2,11358,SBRT Offers Prostate Cancer Patients High Canc...,"September 28, 2016",This news release describes five-year outcomes...,"Mary Chris Jaklevic,Steven J. Atlas, MD, MPH,K...",The news release quotes lead researcher Robert...,https://www.healthnewsreview.org/wp-content/up...,mixture,"Association/Society news release,Cancer"
3,10166,"Study: Vaccine for Breast, Ovarian Cancer Has ...","November 8, 2011","While the story does many things well, the ove...",,"The story does discuss costs, but the framing ...",http://clinicaltrials.gov/ct2/results?term=can...,true,"Cancer,WebMD,women's health"
4,11276,Some appendicitis cases may not require ’emerg...,"September 20, 2010",We really don’t understand why only a handful ...,,"""Although the story didn’t cite the cost of ap...",,true,


In [5]:
test_corpus.head()

Unnamed: 0,claim_id,claim,date_published,explanation,fact_checkers,main_text,sources,label,subjects
0,33456,A mother revealed to her child in a letter aft...,"November 6, 2011",The one-eyed mother story expounds upon two mo...,David Mikkelson,"In April 2005, we spotted a tearjerker on the ...",,false,Glurge Gallery
1,2542,Study says too many Americans still drink too ...,"February 25, 2013","On any given day in the United States, 18 perc...",,That means the great majority of Americans sta...,http://bit.ly/X1NVtW,true,Health News
2,26678,Viral image Says 80% of novel coronavirus case...,"March 13, 2020",The website Information is Beautiful published...,Paul Specht,"Amid the spread of the novel coronavirus, many...",https://www.facebook.com/informationisbeautifu...,true,"Facebook Fact-checks, Coronavirus, Viral image,"
3,40705,An email says that 9-year old Craig Shergold o...,"March 16, 2015",Send greeting or business cards to cancer vict...,Rich Buhler & Staff,Craig Shergold is real and in 1989...,https://www.reddit.com/submit?url=https%3A%2F%...,false,"Inspirational, Pleas"
4,35718,"Employees at a Five Guys restaurant in Daphne,...","July 15, 2020","What's undetermined: As of this writing, Five ...",Dan MacGuill,"In July 2020, amid a new wave of nationwide pr...",,unproven,Law Enforcement


In [38]:
dev_corpus.head()

Unnamed: 0,claim_id,claim,date_published,explanation,fact_checkers,main_text,sources,label,subjects
0,34656,A baby died at an unnamed medical facility be...,"November 10, 2015",Fellow Twitter users suggested @FierceFemtivis...,Kim LaCapria,"On 8 November 2015, former Twitter user @Fierc...",http://webcache.googleusercontent.com/search?q...,unproven,"Politics, fiercefemtivist, racism"
1,3632,Bat from Shawnee County tests positive for rab...,,A bat found in northeastern Kansas has tested ...,,Topeka television station KSNT reports that th...,https://www.ksnt.com/news/bat-tests-positive-f...,true,"Rabies, Health, General News, Kansas, Bats, To..."
2,29558,Germany has banned pork from school canteens b...,"March 7, 2016",What's true: Some politicians complained that ...,Kim LaCapria,"On 7 March 2016, British tabloid Express repor...",http://bnp.org.uk/news/regional/bnp-victory-br...,false,Politics
3,8416,Coronavirus prompts Canada to roll out safe dr...,"April 16, 2020",Canada’s Pacific province of British Columbia ...,Tessa Vikander,"In March, the Canadian government urged provin...",,true,Health News
4,7169,"Wayne National Forest plans fires for tree, wi...",,"Nearly 2,000 acres of Wayne National Forest in...",,Forest officials say scientists who study nati...,,true,"Plants, Wildlife, Health, Wildlife health, For..."


In [11]:
dev_corpus.head()

Unnamed: 0,claim_id,claim,date_published,explanation,fact_checkers,main_text,sources,label,subjects
0,34656,A baby died at an unnamed medical facility be...,"November 10, 2015",Fellow Twitter users suggested @FierceFemtivis...,Kim LaCapria,"On 8 November 2015, former Twitter user @Fierc...",http://webcache.googleusercontent.com/search?q...,unproven,"Politics, fiercefemtivist, racism"
1,3632,Bat from Shawnee County tests positive for rab...,,A bat found in northeastern Kansas has tested ...,,Topeka television station KSNT reports that th...,https://www.ksnt.com/news/bat-tests-positive-f...,true,"Rabies, Health, General News, Kansas, Bats, To..."
2,29558,Germany has banned pork from school canteens b...,"March 7, 2016",What's true: Some politicians complained that ...,Kim LaCapria,"On 7 March 2016, British tabloid Express repor...",http://bnp.org.uk/news/regional/bnp-victory-br...,false,Politics
3,8416,Coronavirus prompts Canada to roll out safe dr...,"April 16, 2020",Canada’s Pacific province of British Columbia ...,Tessa Vikander,"In March, the Canadian government urged provin...",,true,Health News
4,7169,"Wayne National Forest plans fires for tree, wi...",,"Nearly 2,000 acres of Wayne National Forest in...",,Forest officials say scientists who study nati...,,true,"Plants, Wildlife, Health, Wildlife health, For..."


In [9]:
train_corpus = pd.concat([train_corpus, dev_corpus], ignore_index=True)

In [12]:
train_corpus.shape

(11053, 9)

In [13]:
fact_categories = ['false', 'mixture', 'true', 'unproven']

def clean_PUBHEALTH_data(corpus):
    '''Removes unnecessary columns and rows from the PUBHEALTH dataset'''

    corpus.drop(['fact_checkers', 'claim_id', 'sources', 'date_published', 'explanation'], axis=1, inplace=True)
    corpus.dropna(inplace=True)
    corpus = corpus.drop_duplicates()
    corpus['label'] = corpus['label'].str.lower().str.strip()
    corpus = corpus[corpus['label'].isin(fact_categories)]

    return corpus

def filter_PUBHEALTH_medical_data(cleanedCorpus):
    '''Filters the PUBHEALTH dataset to only include medical claims, and removes unnecessary columns'''
    medical_data = cleanedCorpus[-cleanedCorpus['subjects'].str.lower().str.contains(
                                    'foreign policy|political|social|economy|religion|religious|culture|sport|sports|entertainment|other')]
    medical_data.drop(['subjects'], axis=1, inplace=True)
    return medical_data
    
    

In [14]:
train_corpus = clean_PUBHEALTH_data(train_corpus)
test_corpus = clean_PUBHEALTH_data(test_corpus)
medical_data_train = filter_PUBHEALTH_medical_data(train_corpus)
medical_data_test = filter_PUBHEALTH_medical_data(test_corpus)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  medical_data.drop(['subjects'], axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  medical_data.drop(['subjects'], axis=1, inplace=True)


In [15]:
medical_data_train

Unnamed: 0,claim,main_text,label
1,Annual Mammograms May Have More False-Positives,While the financial costs of screening mammogr...,mixture
2,SBRT Offers Prostate Cancer Patients High Canc...,The news release quotes lead researcher Robert...,mixture
3,"Study: Vaccine for Breast, Ovarian Cancer Has ...","The story does discuss costs, but the framing ...",true
4,Some appendicitis cases may not require ’emerg...,"""Although the story didn’t cite the cost of ap...",true
5,Britain to reveal trial criteria for coronavir...,Antibody tests show whether whether people hav...,true
...,...,...,...
11046,A social media post correctly contextualizes t...,"In August 2019, a Facebook post about the Unit...",false
11049,"The state of Florida has seen ""double-digit dr...",As the Legislature considers changing rules fo...,true
11050,Officials probe respiratory illness at Quincy ...,The (Quincy) Herald-Whig reports the departmen...,true
11051,Sylentis announces the results of tivanisiran ...,There is no discussion of the potential cost o...,mixture


In [17]:
# medical_data_train.to_csv(r"./data/PUBHEALTH/medical_data_cleaned_train.tsv", sep='\t', index=False)
# medical_data_test.to_csv(r"./data/PUBHEALTH/medical_data_cleaned_test.tsv", sep='\t', index=False)

In [18]:
# Load datasets
medical_data_train = pd.read_csv(r"./data/PUBHEALTH/medical_data_cleaned_train.tsv", sep='\t')
medical_data_test = pd.read_csv(r"./data/PUBHEALTH/medical_data_cleaned_test.tsv", sep='\t')

In [19]:
input_variables = ['claim', 'main_text']
output_variables = ['label']

In [20]:
def labelEncodeCategories(dataframe_series):
    '''Label encodes the categories in the dataframe series'''
    Encoder = LabelEncoder()
    dataframe_series = Encoder.fit_transform(dataframe_series)
    return dataframe_series

In [21]:
medical_data_train['label'] = labelEncodeCategories(medical_data_train['label'])
medical_data_test['label'] = labelEncodeCategories(medical_data_test['label'])

In [22]:
def performTextPreprocessing(dataframe_series):
    '''
    This function performs text preprocessing on the dataframe series. It do the following tasks:
    1. Tokenize the text
    2. Remove stop words
    3. Lemmatize the text, and returns the dataframe series, containing lemmatized text in form of a space seperated string
    '''
    dataframe_series = dataframe_series.str.lower()
    
    # Word tokenization
    dataframe_series = [word_tokenize(text) for text in dataframe_series]

    # Remove stop words and non-alphanumeric characters (TODO: -- check how much do we need alphanumeric characters)
    stop_words = set(stopwords.words('english'))
    dataframe_series = [[word for word in text if word not in stop_words and word.isalpha()] for text in dataframe_series]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    for index, text in enumerate(dataframe_series):
        sentence = ""
        for word in text:
            word = lemmatizer.lemmatize(word)
            sentence += word + " "
        sentence = sentence.strip()
        dataframe_series[index] = sentence
    return dataframe_series

In [23]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

def perform_tfidf_vectorization(preprocessed_dataframe_series, tfidf_vectorizer):
    ''' Performs tfidf vectorization on the preprocessed dataframe series'''
    return tfidf_vectorizer.fit_transform(preprocessed_dataframe_series)

In [24]:
medical_data_train['claim'] = performTextPreprocessing(medical_data_train['claim'])
claim_data_train_vectorizer = perform_tfidf_vectorization(medical_data_train['claim'], tfidf_vectorizer)
claim_data_train_vectorizer.shape

(10440, 5000)

In [25]:
medical_data_train['main_text'] = performTextPreprocessing(medical_data_train['main_text'])
main_text_data_train_vectorizer = perform_tfidf_vectorization(medical_data_train['main_text'], tfidf_vectorizer)
main_text_data_train_vectorizer.shape

(10440, 5000)

In [26]:
medical_data_train.head()

Unnamed: 0,claim,main_text,label
0,annual mammogram may,financial cost screening mammography recall bi...,1
1,sbrt offer prostate cancer patient high cancer...,news release quote lead researcher robert meie...,1
2,study vaccine breast ovarian cancer potential,story discus cost framing problematic story ba...,2
3,appendicitis case may require emergency surgery,although story cite cost appendectomy emergenc...,2
4,britain reveal trial criterion coronavirus ant...,antibody test show whether whether people infe...,2


In [27]:
# Combining multiple text columns into one
appended_X = hstack((claim_data_train_vectorizer, main_text_data_train_vectorizer))
X_df = pd.DataFrame(appended_X.toarray())
X_df.shape

(10440, 10000)

In [28]:
# Form text preprocessing for test data
medical_data_test['label'] = labelEncodeCategories(medical_data_test['label'])
medical_data_test['claim'] = performTextPreprocessing(medical_data_test['claim'])
claim_data_test_vectorizer = perform_tfidf_vectorization(medical_data_test['claim'], tfidf_vectorizer)
medical_data_test['main_text'] = performTextPreprocessing(medical_data_test['main_text'])
main_text_data_test_vectorizer = perform_tfidf_vectorization(medical_data_test['main_text'], tfidf_vectorizer)


In [40]:
pca = PCA(n_components=5).fit(X_df)
data3D = pca.transform(X_df)

In [41]:
data3D

array([[ 0.15204446, -0.03941411,  0.01891832,  0.19795512,  0.0389014 ],
       [ 0.37782425,  0.00929996,  0.37288002,  0.10423492, -0.04338229],
       [ 0.54710523,  0.09148529,  0.11066959,  0.26389375,  0.14920142],
       ...,
       [-0.13912613,  0.12945496,  0.00720397, -0.03249046,  0.05043963],
       [ 0.15808699,  0.01551904,  0.10261111, -0.10738241, -0.15849623],
       [-0.06001102, -0.01132359,  0.02471988, -0.01695143, -0.10191605]])

In [42]:
plt.scatter(data3D[:,0], data3D[:,1], data3D[:,2], data3D[:,3], data3D[:,4], c=medical_data_train['label'], cmap='rainbow')

TypeError: scatter() got multiple values for argument 'c'

### Multinomial Naive Bayes Classifier

In [64]:
multinomial_nb = naive_bayes.MultinomialNB()
multinomial_nb.fit(X_df,  medical_data_train['label'])

In [65]:
multinomial_nb.score(X_df,  medical_data_train['label'])

0.7189957978666092

### SVM Classifier

In [163]:
svm_classifier = svm.SVC()
svm_classifier.fit(X_df,  medical_data_train['label'])

In [None]:
svm_classifier.score(X_df,  medical_data_train['label'])