# Load libraries needed

In [None]:
#Pandas
import pandas as pd
from pandas import ExcelWriter, ExcelFile

#Numpy
import numpy as np

#NLTK
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
stop = stopwords.words('english')
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict

#SkLearn
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC

#Other
import string
import seaborn as sns
import yellowbrick as yb
from yellowbrick.classifier import ClassificationReport


#Load the project spreadsheet into a DataFrame (path is relative to the working directory)
data_path = 'docs/list_project_updated.xlsx'
df = pd.read_excel(data_path)

In [None]:
#Confirm the import worked by listing the column names
print(df.columns)

#Collect the object-dtype (categorical) columns that will be one hot encoded
cat_var = [name for name, kind in df.dtypes.items() if kind == 'object']

#These two object columns must not be one hot encoded, so take them off the list
for not_encoded in ('CHD_OTHSP', 'SPECOTH'):
    cat_var.remove(not_encoded)

#Show the categorical columns that remain
print(cat_var)

# Prepare Data 

In [None]:
#Per the author, importing with pandas adds an "Unnamed..." index column; find any
#column whose name matches and drop it from df in place
unnamed_cols = df.filter(regex="Unname").columns
df.drop(columns=unnamed_cols, inplace=True)

#Inspect the data: dimensions, first rows, summary statistics
for overview in (df.shape, df.head(), df.describe()):
    print(overview)

In [None]:
#One hot encode every categorical column collected in cat_var
df_processed = pd.get_dummies(df, columns=cat_var, prefix_sep="_")

#Split the encoded columns by dtype: whatever is still object dtype is a text field
#(to be dropped from the first model); every other column is stored in con_var
text = []
con_var = []
for name, kind in df_processed.dtypes.items():
    if kind == 'object':
        text.append(name)
    else:
        con_var.append(name)

#These are the text variables now because we transformed the others
print(text)

#PATIENT_ID is useless information for the models, so take it off the list
#NOTE(review): this only edits the con_var list; df_processed still holds the column
con_var.remove('PATIENT_ID')

#Check how many people are flagged as Heterotaxy against the expected amount
print(df_processed['HETEROTAXY'].value_counts())

#Look at the data
df_processed


In [None]:
#Model without text fields
#Drop the text fields, plus PATIENT_ID: it is an identifier that is meant to stay out of
#every model, but it was only ever removed from the con_var list, never from the frame
df_model1 = df_processed.drop(list(text) + ['PATIENT_ID'], axis=1)


In [None]:
#Reference the model-1 frame
#NOTE(review): this is not the last statement of its cell, so a notebook will not display it
df_model1

#Report how much of each column is missing
def missing(dff):
    """Print the percentage of null values per column of dff, largest first.

    The percentages are rounded to 2 decimals. Nothing is returned.
    """
    null_counts = dff.isnull().sum()
    pct_missing = (null_counts * 100 / len(dff)).round(2)
    print(pct_missing.sort_values(ascending=False))

#Print the missing-value percentages for the model-1 columns; the author flagged
#the amount of missing data as an issue
missing(df_model1)


In [None]:
#Fill missing values with each column's median (an arbitrary choice by the author;
#any other imputation strategy could be swapped in)
column_medians = df_model1.median()
df_imp = df_model1.fillna(column_medians)

df_imp

#Plot the class counts of the target: huge class imbalance
sns.set(font_scale=1.5)
countplt = sns.countplot(data=df_imp, x='HETEROTAXY', palette='hls')
plt.show()


# Prepare Data with Test Train Split

In [None]:
#Everything but the target column is a feature
cols = [col for col in df_imp.columns if col != 'HETEROTAXY']

#The data with all columns but target
data = df_imp[cols]

#The target to predict
target = df_imp['HETEROTAXY']

#Split the data 80/20. HETEROTAXY is heavily imbalanced, so shuffle and stratify on the
#target to keep the class ratio the same in both parts; the previous unshuffled cut
#depended on row order and could leave the test part with few or no positive cases.
#The fixed random_state makes the split reproducible.
data_train, data_test, target_train, target_test = train_test_split(
    data, target, test_size=0.20, stratify=target, random_state=0)

#Print dimensions
for part in (data_train, data_test, target_train, target_test):
    print(part.shape)


# Naive Bayes - NEED TO UPDATE

In [None]:

#Fit a Gaussian Naive Bayes classifier on the training split
gnb = GaussianNB()
gnb.fit(data_train, target_train)

#Predict the held-out test split
pred = gnb.predict(data_test)

#Report the fraction of test rows that were classified correctly
nb_accuracy = accuracy_score(target_test, pred, normalize = True)
print("Naive-Bayes accuracy : ",nb_accuracy)

#TODO: NEED TO FIX!!!


# SVC 

In [None]:
#Imported here so the cell runs on its own
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

#The author kept getting convergence errors from LinearSVC on the raw features.
#Standardise the features inside a pipeline (the scaler is fitted on the training
#split only) and allow more iterations so the solver can converge.
#The fitted LinearSVC itself is available as svc_model.named_steps['linearsvc'].
svc_model = make_pipeline(StandardScaler(), LinearSVC(random_state=0, max_iter=10000))

#Train on the training data and predict the testing data
pred = svc_model.fit(data_train, target_train).predict(data_test)

#Print the accuracy score of the model
print("LinearSVC accuracy : ",accuracy_score(target_test, pred, normalize = True))

# Logistic Regression

In [None]:

#Imported here so the cell runs on its own
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

#Create a logistic regression classifier. The author kept getting convergence errors
#from lbfgs on the raw features, so standardise them inside a pipeline (the scaler is
#fitted on whatever data fit() receives, so refitting logit on other data rescales too).
#The fitted LogisticRegression is available as logit.named_steps['logisticregression'].
logit = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=1000, solver='lbfgs'),
)

#Train on the training data and predict the testing data
pred = logit.fit(data_train, target_train).predict(data_test)

#Print the accuracy score of the model
print("Logistic Regression : ",accuracy_score(target_test, pred, normalize = True))

# Adding first text field

In [None]:
#Just drop SPECOTH as we are adding the first text field. PATIENT_ID is dropped too: it is
#an identifier that is meant to stay out of every model, but it was never removed
#from the frame itself
df_model2 = df_processed.drop(['SPECOTH', 'PATIENT_ID'], axis=1)

#Turn missing into blanks
df_model2["CHD_OTHSP"] = df_model2["CHD_OTHSP"].fillna('')

#Remove punctuation. The result has to be assigned back (it used to be discarded);
#str.translate deletes the characters literally, so no regex escaping is involved
strip_punctuation = str.maketrans('', '', string.punctuation)
df_model2["CHD_OTHSP"] = df_model2["CHD_OTHSP"].str.translate(strip_punctuation)

#This transforms to lowercase, tokenizes, and calculates TFIDF. With default arguments
#TfidfVectorizer does NOT remove stop words; pass stop_words=... if that is wanted
tfidf = TfidfVectorizer()
x = tfidf.fit_transform(df_model2['CHD_OTHSP'])

#get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out() when
#it exists and fall back to the old name on older releases
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

#Reuse df_model2's index so the concat below lines the rows up even when the index
#is not the default 0..n-1 range
temp = pd.DataFrame(x.toarray(), columns=feature_names, index=df_model2.index)

#Concat the results with the original dataframe
df_model2_final = pd.concat([df_model2, temp], axis=1)

#Now that we transformed it, we don't need it anymore
df_model_2 = df_model2_final.drop("CHD_OTHSP",axis=1)

#Checking
print(df_model_2.columns.values.tolist())


In [None]:
#Impute with the median (an arbitrary choice by the author; any other strategy would do)
df_imp2 = df_model_2.fillna(df_model_2.median())

#Everything but the target column is a feature
cols_2 = [col for col in df_imp2.columns if col != 'HETEROTAXY']

#The data with all columns but target
data_2 = df_imp2[cols_2]

#The target to predict
target_2 = df_imp2['HETEROTAXY']

#Split the data 80/20, stratified on the imbalanced target so both parts keep the same
#class ratio; the fixed random_state makes the split (and the score) reproducible
data_train_2, data_test_2, target_train_2, target_test_2 = train_test_split(
    data_2, target_2, test_size=0.20, stratify=target_2, random_state=0)

#Show how many columns there are of each dtype (printed explicitly: a bare expression
#in the middle of a cell is evaluated but never displayed)
print(df_imp2.dtypes.value_counts())

#Print dimensions
for part in (data_train_2, data_test_2, target_train_2, target_test_2):
    print(part.shape)

#Refit logit on the model-2 training data (this replaces its earlier fit) and predict
#the model-2 testing data
pred2 = logit.fit(data_train_2, target_train_2).predict(data_test_2)

#Print the accuracy score of the model
print("Logistic Regression : ",accuracy_score(target_test_2, pred2, normalize = True))
