In [18]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk  import pos_tag
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn, stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from scipy.sparse import hstack
import pickle

In [2]:
np.random.seed(500)

In [12]:
class TrainMLModels:
    tfidf_vectorizer = TfidfVectorizer(max_features=5000)

    def __performTextPreprocessing_PUBHEALTH(self, dataframe_series):
        '''
        This function performs text preprocessing on the dataframe series. It do the following tasks:
        1. Tokenize the text
        2. Remove stop words
        3. Lemmatize the text, and returns the dataframe series, containing lemmatized text in form of a space seperated string
        '''
        dataframe_series = dataframe_series.str.lower()
        
        # Word tokenization
        dataframe_series = [word_tokenize(text) for text in dataframe_series]

        # Remove stop words and non-alphanumeric characters (TODO: -- check how much do we need alphanumeric characters)
        stop_words = set(stopwords.words('english'))
        dataframe_series = [[word for word in text if word not in stop_words and word.isalpha()] for text in dataframe_series]

        # Lemmatization
        lemmatizer = WordNetLemmatizer()
        for index, text in enumerate(dataframe_series):
            sentence = ""
            for word in text:
                word = lemmatizer.lemmatize(word)
                sentence += word + " "
            sentence = sentence.strip()
            dataframe_series[index] = sentence
        return dataframe_series
    

    def __perform_tfidf_vectorization_PUBHEALTH(self, preprocessed_dataframe_series):
        ''' Performs tfidf vectorization on the preprocessed dataframe series'''
        return self.tfidf_vectorizer.fit_transform(preprocessed_dataframe_series)
    
    def __generate_claim_label_mapping(self, dataframe_series):
        possible_labels = dataframe_series.unique()
        label_dict = {}
        for index, possible_label in enumerate(possible_labels):
            label_dict[possible_label] = index
        return label_dict
    
    def get_train_test_Variables(self):
        '''Preform all data preprocessing tasks and return x_train, y_train, x_test, y_test, y_text_to_integer_mapping'''
        medical_data_train = pd.read_csv(r"./data/PUBHEALTH/medical_data_cleaned_train.tsv", sep='\t')
        medical_data_test = pd.read_csv(r"./data/PUBHEALTH/medical_data_cleaned_test.tsv", sep='\t')
        
        # Merge claim and main_text columns into single one
        medical_data_train['claim'] = medical_data_train['claim'] + '. ' + medical_data_train['main_text']
        medical_data_test['claim'] = medical_data_test['claim'] + '. ' + medical_data_test['main_text']
        medical_data_train.drop(columns=['main_text'], inplace=True)
        medical_data_test.drop(columns=['main_text'], inplace=True)
        
        # Label encode claim label categories
        label_dict = self.__generate_claim_label_mapping(medical_data_train['label'])
        y_train = medical_data_train['label'].replace(label_dict)
        y_test = medical_data_test['label'].replace(label_dict)


        # Perform text preprocessing
        medical_data_train['claim'] = self.__performTextPreprocessing_PUBHEALTH(
            medical_data_train['claim']
        )
        medical_data_test['claim'] = self.__performTextPreprocessing_PUBHEALTH(
            medical_data_test['claim']
        )

        # Perform tfidf vectorization
        x_train = self.__perform_tfidf_vectorization_PUBHEALTH(medical_data_train['claim'])
        x_test = self.__perform_tfidf_vectorization_PUBHEALTH(medical_data_test['claim'])
        return (x_train, y_train, x_test, y_test, label_dict)

In [13]:
model_trainer = TrainMLModels()
data = model_trainer.get_train_test_Variables()

In [14]:
x_train = data[0]
y_train = data[1]
x_test = data[2]
y_test = data[3]
label_dict = data[4]

### Multinomial Naive Bayes classifier

In [15]:
multinomial_nb = naive_bayes.MultinomialNB()
multinomial_nb.fit(x_train,  y_train)

In [16]:
multinomial_nb.score(x_train, y_train)

0.6618773946360154

In [17]:
multinomial_nb.score(x_test, y_test)

0.3734015345268542

In [19]:
pickle.dump(multinomial_nb, open("multinomial_nb_classifier", "wb"))

### SVM Classifier

In [20]:
svm_classifier = svm.SVC()
svm_classifier.fit(x_train, y_train)

In [21]:
svm_classifier.score(x_train, y_train)

0.9153256704980843

In [22]:
svm_classifier.score(x_test, y_test)

0.4680306905370844

In [23]:
pickle.dump(svm_classifier, open("svm_classifier", "wb"))