In [55]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk  import pos_tag
from nltk.stem import WordNetLemmatizer, PorterStemmer
import string
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn, stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import model_selection
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from scipy.sparse import hstack

In [36]:
# Load datasets
medical_data_train = pd.read_csv(r"./data/PUBHEALTH/medical_data_cleaned_train.tsv", sep='\t')
medical_data_test = pd.read_csv(r"./data/PUBHEALTH/medical_data_cleaned_test.tsv", sep='\t')

In [37]:
medical_data_train.head()

Unnamed: 0,claim,main_text,label
0,Annual Mammograms May Have More False-Positives,While the financial costs of screening mammogr...,mixture
1,SBRT Offers Prostate Cancer Patients High Canc...,The news release quotes lead researcher Robert...,mixture
2,"Study: Vaccine for Breast, Ovarian Cancer Has ...","The story does discuss costs, but the framing ...",true
3,Some appendicitis cases may not require ’emerg...,"""Although the story didn’t cite the cost of ap...",true
4,Britain to reveal trial criteria for coronavir...,Antibody tests show whether whether people hav...,true


In [38]:
# Merge claim and main_text columns into one column named 'claim'
medical_data_train['claim'] = medical_data_train['claim'] + '. ' + medical_data_train['main_text']
medical_data_test['claim'] = medical_data_test['claim'] + '. ' + medical_data_test['main_text']
medical_data_train.drop(columns=['main_text'], inplace=True)
medical_data_test.drop(columns=['main_text'], inplace=True)
medical_data_train.shape

(10440, 2)

In [39]:
medical_data_train.head()

Unnamed: 0,claim,label
0,Annual Mammograms May Have More False-Positive...,mixture
1,SBRT Offers Prostate Cancer Patients High Canc...,mixture
2,"Study: Vaccine for Breast, Ovarian Cancer Has ...",true
3,Some appendicitis cases may not require ’emerg...,true
4,Britain to reveal trial criteria for coronavir...,true


In [40]:
def labelEncodeCategories(dataframe_series):
    '''Label encodes the categories in the dataframe series'''
    Encoder = LabelEncoder()
    dataframe_series = Encoder.fit_transform(dataframe_series)
    return dataframe_series

In [41]:
medical_data_train['label'] = labelEncodeCategories(medical_data_train['label'])
medical_data_test['label'] = labelEncodeCategories(medical_data_test['label'])

In [42]:
def text_process(mess):
    """
    Takes in a string of text, then performs the following:
    1. Remove all punctuation
    2. Remove all stopwords
    3. Returns a list of the cleaned text
    """
    # Check characters to see if they are in punctuation
    nopunc = [char for char in mess if char not in string.punctuation]

    # Join the characters again to form the string.
    nopunc = ''.join(nopunc)
    
    # Now just remove any stopwords
    return [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]

In [43]:
medical_data_train['claim'].head(5).apply(text_process)

0    [Annual, Mammograms, May, FalsePositives, fina...
1    [SBRT, Offers, Prostate, Cancer, Patients, Hig...
2    [Study, Vaccine, Breast, Ovarian, Cancer, Pote...
3    [appendicitis, cases, may, require, ’emergency...
4    [Britain, reveal, trial, criteria, coronavirus...
Name: claim, dtype: object

In [44]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

def perform_tfidf_vectorization(preprocessed_dataframe_series, tfidf_vectorizer):
    ''' Performs tfidf vectorization on the preprocessed dataframe series'''
    return tfidf_vectorizer.fit_transform(preprocessed_dataframe_series)

In [45]:
claim_data_train_vectorizer = perform_tfidf_vectorization(medical_data_train['claim'], tfidf_vectorizer)
claim_data_train_vectorizer.shape

(10440, 5000)

In [46]:
medical_data_train.head()

Unnamed: 0,claim,label
0,Annual Mammograms May Have More False-Positive...,1
1,SBRT Offers Prostate Cancer Patients High Canc...,1
2,"Study: Vaccine for Breast, Ovarian Cancer Has ...",2
3,Some appendicitis cases may not require ’emerg...,2
4,Britain to reveal trial criteria for coronavir...,2


In [47]:
medical_data_test['claim'].head(5).apply(text_process)

0    [mother, revealed, child, letter, death, one, ...
1    [Study, says, many, Americans, still, drink, m...
2    [Viral, image, Says, 80, novel, coronavirus, c...
3    [email, says, 9year, old, Craig, Shergold, Car...
4    [Employees, Five, Guys, restaurant, Daphne, Al...
Name: claim, dtype: object

In [48]:
claim_data_test_vectorizer = perform_tfidf_vectorization(medical_data_test['claim'], tfidf_vectorizer)
claim_data_test_vectorizer.shape

(1173, 5000)

In [49]:
medical_data_test.head()

Unnamed: 0,claim,label
0,A mother revealed to her child in a letter aft...,0
1,Study says too many Americans still drink too ...,2
2,Viral image Says 80% of novel coronavirus case...,2
3,An email says that 9-year old Craig Shergold o...,0
4,"Employees at a Five Guys restaurant in Daphne,...",3


In [50]:
model = PassiveAggressiveClassifier()
model.fit(claim_data_train_vectorizer, medical_data_train['label'])

PassiveAggressiveClassifier()

In [51]:
model.score(claim_data_train_vectorizer, medical_data_train['label'])

0.9935823754789272

In [53]:
predictions = model.predict(claim_data_test_vectorizer)

In [56]:
print(classification_report(claim_data_test_vectorizer,medical_data_test['label']))

TypeError: len() of unsized object

In [12]:
class PassiveAggressiveModel: 

    def __init__(self) -> None:
        pass

    def text_processing_pipeline(self):
        print("Processing Pipeline")

    def train(self):
        print("Train")
    
    def test(self):
        print("Test")

    def predict(self):
        print("Predict")

    def save_model(self):
        print("Saving Model")

In [13]:
model = PassiveAggressiveModel()
model.save_model()

Saving Model
