In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import scipy.stats as stats
from sklearn import preprocessing
%matplotlib inline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SequentialFeatureSelector as sfs
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from scipy import stats
from scipy.stats import linregress

In [None]:
# Load the pre-processed train / validation / test splits from CSV.
# NOTE(review): these paths point at a local Desktop folder, while the JSON
# reference files below are read from /content/drive — confirm both locations
# are reachable from the same runtime.
train, validate, test = map(
    pd.read_csv,
    (
        '~/Desktop/trainProcessed.csv',
        '~/Desktop/validateProcessed.csv',
        '~/Desktop/testProcessed.csv',
    ),
)

In [None]:
import json

# Reference dictionaries used by check_typo: the keys of `pathology_dict` are
# the accepted pathology names, and `evidences_dict[question]["possible-values"]`
# lists the accepted values for each evidence question.
# The files are opened in `with` blocks so the handles are closed as soon as
# parsing finishes, and the encoding is pinned so the result does not depend
# on the platform's default locale.
with open('/content/drive/MyDrive/release_conditions.json', 'r', encoding='utf-8') as conditions_file:
    pathology_dict = json.load(conditions_file)

with open('/content/drive/MyDrive/release_evidences.json', 'r', encoding='utf-8') as evidences_file:
    evidences_dict = json.load(evidences_file)

In [None]:
def check_typo(df, pathologies=None, evidences=None):
    """Report and remove rows whose pathology or evidences are not recognised.

    A row is considered invalid when any of the following holds:

    * its ``PATHOLOGY`` is not a key of ``pathologies``;
    * one of its evidences names a question (the part before ``'_@_'``)
      that is not a key of ``evidences``;
    * one of its evidences carries a value (the part after the first
      ``'_@_'``) that is not among that question's ``"possible-values"``
      (compared as strings, so ``'3'`` matches the integer ``3``).

    Every problem found is printed. Invalid rows are then dropped from
    ``df`` **in place**, in a single operation after the scan, so the frame
    is never modified while it is being iterated and a row with several
    problems is only dropped once.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``PATHOLOGY`` column and an ``EVIDENCES`` column whose
        cells are iterables of strings.
    pathologies : mapping or container, optional
        Accepted pathology names. Defaults to the module-level
        ``pathology_dict``.
    evidences : mapping, optional
        Maps each question to a dict holding a ``"possible-values"`` list.
        Defaults to the module-level ``evidences_dict``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after the invalid rows have been removed.
    """
    if pathologies is None:
        pathologies = pathology_dict
    if evidences is None:
        evidences = evidences_dict

    bad_rows = []
    for n, pathology, row_evidences in zip(df.index, df['PATHOLOGY'], df['EVIDENCES']):
        typo_found = False

        if pathology not in pathologies:
            print(f"{n} has a typo in pathology: {pathology}")
            typo_found = True

        for evidence in row_evidences:
            evidence_split = evidence.split('_@_')
            question = evidence_split[0]
            if question not in evidences:
                print(f'{n} has a typo in question: {evidence}')
                typo_found = True
                # The value cannot be validated without a known question.
                continue
            if len(evidence_split) > 1:
                value = evidence_split[1]
                # `value` is always a string, so comparing against the
                # stringified possible values covers both str and int entries.
                allowed = {str(e) for e in evidences[question]["possible-values"]}
                if value not in allowed:
                    print(f'{n} has a typo in value: {evidence}')
                    typo_found = True

        if typo_found:
            bad_rows.append(n)

    if bad_rows:
        df.drop(index=bad_rows, inplace=True)
    return df

In [None]:
def one_hot_encode(df):
    """One-hot encode the ``EVIDENCES`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have an ``EVIDENCES`` column whose cells are iterables of
        hashable items (here: lists of evidence strings).

    Returns
    -------
    pandas.DataFrame
        Integer frame sharing ``df``'s index, with one column per distinct
        evidence item; a cell is 1 when the row's evidences contain that
        item and 0 otherwise. Columns are sorted so that the layout is
        identical from run to run (iterating a ``set`` of strings is not
        order-stable across interpreter sessions).
    """
    # Per-row sets give constant-time membership tests in the loop below.
    row_sets = [set(sublist) for sublist in df['EVIDENCES']]
    unique_items = sorted(set().union(*row_sets), key=str)

    encoded_evidence = pd.DataFrame(0, index=df.index, columns=unique_items)
    for item in unique_items:
        # A plain list is assigned positionally, so no index alignment is involved.
        encoded_evidence[item] = [item in row for row in row_sets]
    return encoded_evidence.astype(int)

In [None]:
def _parse_evidence_list(raw):
    """Split the string form of an evidence list (e.g. "['a', 'b_@_1']") into its tokens."""
    tokens = (token.strip(" ''") for token in raw.strip("[]").split(","))
    # An empty list ("[]") would otherwise produce a single bogus '' token.
    return [token for token in tokens if token]


def readingData(df):
    """Clean one dataset split and one-hot encode its evidences.

    Steps, in order:

    1. print the shape, the per-column missing-value counts and the number
       of duplicated rows;
    2. drop rows containing missing values, then duplicated rows
       (duplicates are removed while ``EVIDENCES`` is still a string,
       because list cells cannot be hashed);
    3. parse each ``EVIDENCES`` string into a list of evidence tokens;
    4. keep only rows with ``0 <= AGE <= 120``;
    5. run ``check_typo`` on the frame;
    6. map ``SEX`` to ``'M'`` -> 0 and ``'F'`` -> 1 (any other value becomes NaN);
    7. replace ``EVIDENCES`` with the indicator columns returned by
       ``one_hot_encode``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw split with at least the columns ``AGE``, ``SEX`` and
        ``EVIDENCES``. The caller's frame is left unmodified.

    Returns
    -------
    pandas.DataFrame
        The cleaned, encoded frame. The original row labels are kept.
    """
    # Get an overview of data
    print("DataFrame shape: "+str(df.shape))

    # Deal with NaN values and duplicates. Both methods return a new frame,
    # so the result has to be assigned for the rows to actually be removed.
    print(df.isna().sum())
    df = df.dropna()
    print('Total Duplicate: '+str(df.duplicated().sum()))
    df = df.drop_duplicates()

    # Turn variable "EVIDENCES" from string to list
    df['EVIDENCES'] = [_parse_evidence_list(evidence_row) for evidence_row in df['EVIDENCES']]

    # Drop outliers; the explicit copy keeps the later column assignments
    # from writing into a filtered view of another frame.
    df = df[df['AGE'].between(0, 120)].copy()

    # Check typos (called for its effect on `df`; the return value is not used)
    check_typo(df)

    # Data encoding
    df['SEX'] = df['SEX'].map({'M': 0, 'F': 1})
    encoded_df = one_hot_encode(df)
    df = df.drop('EVIDENCES', axis=1).join(encoded_df)
    print(df.head())

    return df

In [None]:
# Run the cleaning / encoding pipeline on each split, in train -> validate -> test order.
# The results are bound back to the same names, so this cell expects the raw
# frames as input and is not meant to be run twice in a row.
train, validate, test = (
    readingData(split) for split in (train, validate, test)
)

In [None]:
# Union of the column names of the three splits, in first-seen order.
all_columns = pd.Series(
    [*train.columns, *validate.columns, *test.columns]
).drop_duplicates()

# Give every split the same columns in the same order; a column that a split
# did not have is created and filled with 0.
train, validate, test = (
    split.reindex(columns=all_columns, fill_value=0)
    for split in (train, validate, test)
)