In [5]:
import numpy as np
from datascience import *

In [6]:
### COVID-19 PREDICTION TOOL ###
### Developer: Vedansh Malhotra ###
### Sources: CSV file taken from https://github.com/nshomron/covidpred/blob/master/data/corona_tested_individuals_ver_006.english.csv.zip ###
###          CSV file originally sourced from: https://data.gov.il/dataset/covid-19 ###
#--------------------------------------------------------------------------------------------------------------------------------------------#

# To classify a person, create a cell at the bottom of the notebook and run classify(<information list>, k), #
# where k is the number of nearest neighbours that vote on the result. #
# The information list is a list containing 0s and 1s, corresponding to No/Yes, in the following order: #

# cough || fever || sore_throat || shortness_of_breath || head_ache || age_60_and_above || sex(1: Male, 0: Female) #

# For instance, if a female under the age of 60 has cough, sore throat, and head ache, they would be classified #
# (using the 5 nearest neighbours) with: #
# classify([1, 0, 1, 0, 1, 0, 0], 5) #

In [7]:
# The CSV file of test results is read into a Table, and its first 10 rows are displayed. #

data_file = "COVID_DataSet.csv"
raw_data = Table.read_table(data_file)
raw_data.show(10)

test_date,cough,fever,sore_throat,shortness_of_breath,head_ache,corona_result,age_60_and_above,gender,test_indication
2020-04-30,0,0,0,0,0,negative,,female,Other
2020-04-30,1,0,0,0,0,negative,,female,Other
2020-04-30,0,1,0,0,0,negative,,male,Other
2020-04-30,1,0,0,0,0,negative,,female,Other
2020-04-30,1,0,0,0,0,negative,,male,Other
2020-04-30,1,0,0,0,0,negative,,female,Other
2020-04-30,1,1,0,0,0,negative,,male,Abroad
2020-04-30,0,0,0,0,0,negative,,female,Other
2020-04-30,0,0,0,0,0,negative,,male,Other
2020-04-30,0,0,0,0,0,negative,,male,Contact with confirmed


In [8]:
# The test date and the test indication are not needed by the classifier, so both columns are removed. #
unused_columns = ("test_date", "test_indication")
filtered = raw_data.drop(*unused_columns)
filtered.show(10)

cough,fever,sore_throat,shortness_of_breath,head_ache,corona_result,age_60_and_above,gender
0,0,0,0,0,negative,,female
1,0,0,0,0,negative,,female
0,1,0,0,0,negative,,male
1,0,0,0,0,negative,,female
1,0,0,0,0,negative,,male
1,0,0,0,0,negative,,female
1,1,0,0,0,negative,,male
0,0,0,0,0,negative,,female
0,0,0,0,0,negative,,male
0,0,0,0,0,negative,,male


In [9]:
# The rows are shuffled as a precaution, to remove any systematic ordering present in the source file. #
# The random generator is seeded first so that the shuffle (and everything derived from it: the sample, #
# the train/test split and the measured accuracy) is the same on every Restart & Run All. #
# NOTE(review): this relies on Table.sample drawing from NumPy's global random state - confirm for the installed version. #
SHUFFLE_SEED = 42
np.random.seed(SHUFFLE_SEED)
shuffled = filtered.sample(with_replacement=False)
shuffled.show(10)

cough,fever,sore_throat,shortness_of_breath,head_ache,corona_result,age_60_and_above,gender
0,0,0,0,0,negative,,male
0,0,0,0,0,negative,No,male
0,0,0,0,0,negative,,female
0,0,0,0,0,negative,No,male
1,0,0,0,0,negative,No,female
0,0,0,0,0,negative,Yes,
0,1,0,0,0,negative,,male
0,0,0,0,0,negative,,male
1,1,0,0,0,negative,No,female
1,0,0,0,0,negative,,female


In [10]:
# Up to 50,000 shuffled rows are kept; after cleaning they are split into the training and the test set. #
# The size is capped at the number of available rows so that take() never indexes past the end of a smaller data set. #
sample_size = min(50000, shuffled.num_rows)
shortened = shuffled.take(range(sample_size))

In [11]:
# Some rows hold missing or unrecognised entries such as 'None' or 'NaN'. #
# A row is kept only if every column contains one of the values expected for that column. #

allowed_values = [
    ("age_60_and_above", ["No", "Yes"]),
    ("corona_result", ["positive", "negative"]),
    ("gender", ["female", "male"]),
    ("cough", ["0", "1"]),
    ("fever", ["0", "1"]),
    ("sore_throat", ["0", "1"]),
    ("shortness_of_breath", ["0", "1"]),
    ("head_ache", ["0", "1"]),
]

# The filters are applied one column at a time, each narrowing the result of the previous one. #
clean = shortened
for column_label, accepted in allowed_values:
    clean = clean.where(column_label, are.contained_in(accepted))

clean.show(10)

cough,fever,sore_throat,shortness_of_breath,head_ache,corona_result,age_60_and_above,gender
0,0,0,0,0,negative,No,male
0,0,0,0,0,negative,No,male
1,0,0,0,0,negative,No,female
1,1,0,0,0,negative,No,female
0,0,0,0,0,negative,No,female
1,1,1,0,1,positive,No,female
0,0,0,0,0,negative,No,male
0,0,0,0,0,negative,No,male
1,0,0,0,1,positive,No,female
0,0,0,0,0,negative,No,male


In [12]:
# To compute distances between members of the population, categorical values are converted to numbers. #

mapping_dict = {
    "negative" : 0,
    "positive" : 1,
    "No" : 0,
    "Yes" : 1,
    "female" : 0,
    "male" : 1
}

# Each categorical column is translated through mapping_dict into a genuine integer array. #
# (Writing the integers back into the original string arrays would silently store them as the text '0'/'1'.) #
ages = np.array([mapping_dict[value] for value in clean.column("age_60_and_above")], dtype=int)
test_results = np.array([mapping_dict[value] for value in clean.column("corona_result")], dtype=int)
sexes = np.array([mapping_dict[value] for value in clean.column("gender")], dtype=int)

# 'corona_result' and 'age_60_and_above' are replaced by their encoded versions; #
# the encoded 'gender' values are stored under the new label 'sex' and the old column is dropped. #
clean = clean.with_columns("corona_result", test_results, "age_60_and_above", ages, "sex", sexes).drop('gender')
# Note that the term gender has been changed to sex
clean.show(10)

cough,fever,sore_throat,shortness_of_breath,head_ache,corona_result,age_60_and_above,sex
0,0,0,0,0,0,0,1
0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0
0,0,0,0,0,0,0,0
1,1,1,0,1,1,0,0
0,0,0,0,0,0,0,1
0,0,0,0,0,0,0,1
1,0,0,0,1,1,0,0
0,0,0,0,0,0,0,1


In [39]:
# The cleaned data are divided into a test set and a training set so that the classifier's accuracy can be measured: #
# the classifier only ever sees the training set, and is afterwards evaluated on the test set. #
# The split point is the middle row, giving two (approximately) equal halves: first half -> test, second half -> train. #

midpoint = clean.num_rows // 2
test_set = clean.take(range(midpoint))
train_set = clean.take(range(midpoint, clean.num_rows))

In [40]:
def table_to_int(table):
    """Return a version of ``table`` in which every column holds integers.

    Each column is read from ``table``, cast with ``astype(int)`` and written
    back under the same label via ``with_column``; ``table`` itself is not
    modified by this function. A table without columns is returned as is.
    """
    converted = table
    for label in table.labels:
        int_values = np.array(table.column(label)).astype(int)
        converted = converted.with_column(label, int_values)
    return converted

In [41]:
# In the original table the symptom values (0/1) were entered as strings. #
# Distances can only be computed on numbers, so both sets are converted to integers here. #

train_set, test_set = table_to_int(train_set), table_to_int(test_set)

In [42]:
def distance(new_data_point, training_data_table):
    """Return the training table with an added column of Euclidean distances to a new point.

    Parameters
    ----------
    new_data_point : array-like of numbers
        Feature values of the point being compared, in the same order as the
        columns of ``training_data_table`` once 'corona_result' is removed.
    training_data_table : Table
        Table of numeric feature columns plus a 'corona_result' column.

    Returns
    -------
    Table
        ``training_data_table`` with an extra 'distances' column; entry i is
        the Euclidean distance between row i's features and ``new_data_point``.
        The table passed in is not modified.
    """
    # The result column is the label, not a feature, so it must not contribute to the distance.
    feature_table = training_data_table.drop('corona_result')

    # Stack the feature columns into a (rows x features) matrix so all distances
    # are computed in one vectorised step, instead of extracting each row and
    # re-copying the output array with np.append once per row.
    feature_matrix = np.column_stack(
        [feature_table.column(label) for label in feature_table.labels]
    )
    offsets = feature_matrix - np.asarray(new_data_point)
    distances = np.sqrt(np.sum(offsets ** 2, axis=1))
    return training_data_table.with_column('distances', distances)

In [43]:
def k_nearest_neighbours(new_data_point, k):
    """Return the 'k' rows of the training set that lie closest to a new data point.

    The distance from ``new_data_point`` to every row of ``train_set`` is
    computed with ``distance``; the rows are then ordered by ascending
    'distances' and the first ``k`` of them are returned (including the
    'distances' column).
    """
    by_closeness = distance(new_data_point, train_set).sort('distances', descending=False)
    return by_closeness.take(range(k))

In [44]:
def classify(new_data_point, k=5):
    """Classify one new data point as 'POSITIVE' or 'NEGATIVE' using its 'k' nearest neighbours.

    Parameters
    ----------
    new_data_point : array-like of numbers
        The 0/1 feature values of the person to classify, in the order
        cough, fever, sore_throat, shortness_of_breath, head_ache,
        age_60_and_above, sex.
    k : int, optional
        Number of nearest neighbours that vote on the result. Defaults to 5
        (the same default used by ``accuracy``), so the one-argument call
        ``classify([1, 0, 1, 0, 1, 0, 0])`` works.

    Returns
    -------
    str
        'NEGATIVE' if the most common 'corona_result' among the neighbours is
        0, otherwise 'POSITIVE'.
    """
    nn = k_nearest_neighbours(new_data_point, k)

    # Count how many of the k neighbours carry each result, then bring the
    # most frequent result to the top of the table.
    # NOTE(review): with an even k the vote can tie, and the winner is then
    # whichever row the sort places first - prefer an odd k.
    results = nn.select('corona_result')
    counts = results.group('corona_result')
    sorted_counts = counts.sort('count', descending=True)

    # Column 0 of the grouped table holds the result values; row 0 is the winner.
    result_key = sorted_counts.column(0).item(0)
    if result_key == 0:
        return 'NEGATIVE'
    return 'POSITIVE'

In [50]:
def accuracy(k=5, n=10):
    """Return the accuracy (%) of the classifier with 'k' nearest neighbours on the first 'n' rows of the test set.

    Every one of the first ``n`` test rows is classified from its features
    alone; the prediction is then compared with the row's recorded
    'corona_result'. The return value is the percentage of matching rows.
    """
    # The small default values for k and n keep the (slow) evaluation manageable. #

    evaluated_rows = test_set.take(range(n))
    feature_rows = test_set.drop('corona_result').take(range(n))

    # classify() answers with a string; translate it back to the table's 0/1 coding.
    predicted = np.array(
        [
            0 if classify(np.array(feature_rows.row(index)), k) == 'NEGATIVE' else 1
            for index in range(n)
        ],
        dtype=float,
    )

    scored = evaluated_rows.with_column('predicted_corona_result', predicted)
    hits = sum(scored.column('predicted_corona_result') == scored.column('corona_result'))
    return (hits / scored.num_rows) * 100

In [52]:
# Accuracy (%) of the classifier using the 5 nearest neighbours, measured on 1000 members of the test set. #
# A recorded run of this cell reported 92.8%; the exact figure depends on how the rows were shuffled. #
print(accuracy(k=5, n=1000))

92.80000000000001
