In [39]:
import numpy as np
import os
import pandas as pd
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import math
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV


In [40]:
DATASET_PATH = "./datasets"

def load_covid_data(filename, dataset_path=DATASET_PATH):
    xlsx_path = os.path.join(dataset_path, filename)
    return pd.read_excel(xlsx_path)

In [41]:
def convert_age(age):
  if type(age) == str or age == None or math.isnan(age):
    return 1
  else:
    return int(age)

def parse_travel_history(travel_history):
  if type(travel_history) == str and travel_history.startswith('n'):
    return True
  return False


def parse_public_transport(public_transport):
  if type(public_transport) == str:
    public_transport = public_transport.lower()
    if "masina" in public_transport or public_transport.startswith("n"):
      return False
    return True
  return False

def parse_result(result):
  if type(result) == str:
    result = result.lower()
    if result.startswith("n"):
      return False
    return True
  return False

def parse_contact(contact):
  if type(contact) == str:
    contact = contact.lower()
    if "tie" in contact:
      return 2
    elif "nu" in contact or "neag" in contact:
      return 0
    return 1
  return 0


def parse_symptoms(symptoms):

  if type(symptoms) is not str:
    return ''

  rv = dict() 
  symptoms = symptoms.lower()
  
  rv['febra'] = 'febr' in symptoms or 'temp' in symptoms
  rv['tuse'] = 'tuse' in symptoms
  rv['dispnee'] = 'dispnee' in symptoms
  rv['fatigabilitate'] = 'fatigabilitate' in symptoms or 'astenie' in symptoms
  rv['diaree'] = 'diaree' in symptoms
  rv['dureri'] = 'dur' in symptoms
  rv['simturi'] = 'gust' in symptoms or 'miros' in symptoms or 'simt' in symptoms
  rv['greturi'] = 'gre' in symptoms or 'vars' in symptoms

  return rv

def parse_suspect(suspect):

  if type(suspect) is not str:
    return 2

  if 'sus' in suspect or 'cov' in suspect:
    return 1

  return 0

full_data = load_covid_data("mps.dataset.xlsx")
full_data.rename(columns={'vârstă': 'age'}, inplace=True)
full_data.rename(columns={'istoric de călătorie': 'travel_history'}, inplace=True)
full_data.rename(columns={'mijloace de transport folosite': 'public_transport'}, inplace=True)
full_data.rename(columns={'confirmare contact cu o persoană infectată': 'contact'}, inplace=True)
full_data.rename(columns={'rezultat testare': 'result'}, inplace=True)
full_data.rename(columns={'simptome raportate la internare': 'symptoms'}, inplace=True)
full_data.rename(columns={'diagnostic și semne de internare': 'suspect'}, inplace=True)


full_data.replace(['F','FEMININ     ','FEMININ'],'FEMININ', inplace=True)
full_data.replace(['masculin'],'MASCULIN', inplace=True)
full_data = full_data[full_data['sex'].notna()]
full_data['age'] = full_data['age'].map(convert_age)
full_data = full_data[full_data['age'] < 100]
full_data = full_data[full_data['result'] != 'NECONCLUDENT']
full_data['travel_history'] = full_data['travel_history'].map(parse_travel_history)
full_data['public_transport'] = full_data['public_transport'].map(parse_public_transport)
full_data['result'] = full_data['result'].map(parse_result)
full_data['contact'] = full_data['contact'].map(parse_contact)
full_data['simptome declarate'] = full_data['simptome declarate'].map(str)
full_data['symptoms'] = full_data['symptoms'].astype(str) + ' ' +full_data['simptome declarate']
full_data['symptoms'] = full_data['symptoms'].map(parse_symptoms)
full_data['suspect'] = full_data['suspect'].map(parse_suspect)

full_data = pd.concat([full_data, full_data['symptoms'].apply(pd.Series)], axis = 1).drop('symptoms', axis = 1)


_ = full_data.pop('instituția sursă')
_ = full_data.pop('dată debut simptome declarate')
_ = full_data.pop('dată internare')
_ = full_data.pop('data rezultat testare')
_ = full_data.pop('simptome declarate')


In [43]:
full_data.pop('sex')
full_data.pop('age')
X = full_data
y = full_data.pop('result')

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=0)

rf = RandomForestClassifier(n_estimators=100, random_state=123456)
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=123456,
                       verbose=0, warm_start=False)

In [45]:
predicted = rf.predict(X_test)
accuracy = accuracy_score(y_test, predicted)
print(f'Mean accuracy score: {accuracy:.3}')

Mean accuracy score: 0.908
