## Import Dependancies

In [1]:
# Dependencies and Setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier, KDTree
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from wordcloud import WordCloud, STOPWORDS
#from sklearn.metrics import confusion_matrix, classification_report

In [2]:
# File to Load (Remember to Change These)
data = "../Resources/FINAL_CLEAN_FILE.csv"

# read to dataframe
data_df = pd.read_csv(data)
data_df.head()

Unnamed: 0,VAERS_ID,SYMPTOM,ASSIGNED_GROUP,SEVERITY_LEVEL,DIED,L_THREAT,HOSPITAL,AGE_YRS,AGE_GROUP,OTHER_MEDS,...,VAX_SITE_RA,18-25,26-35,36-45,46-55,56-65,66-75,76-85,86-95,96 +
0,916600,Dysphagia,Difficulty swallowing,MILD,0.0,0.0,0.0,33.0,26-35,0.0,...,0.0,0,1,0,0,0,0,0,0,0
1,917168,Dysphagia,Difficulty swallowing,MILD,0.0,0.0,0.0,64.0,56-65,1.0,...,1.0,0,0,0,0,1,0,0,0,0
2,918386,Dysphagia,Difficulty swallowing,MILD,0.0,0.0,0.0,62.0,56-65,1.0,...,1.0,0,0,0,0,1,0,0,0,0
3,918393,Dysphagia,Difficulty swallowing,MILD,0.0,0.0,0.0,62.0,56-65,1.0,...,1.0,0,0,0,0,1,0,0,0,0
4,918479,Dysphagia,Difficulty swallowing,MILD,0.0,0.0,0.0,37.0,36-45,1.0,...,1.0,0,0,1,0,0,0,0,0,0


In [3]:
severity = data_df['SEVERITY_LEVEL'].unique()

## Prep Data For Model

In [4]:
# create train data
X = data_df.drop(['VAERS_ID', 'SYMPTOM','ASSIGNED_GROUP','SEVERITY_LEVEL','HOSPITAL','AGE_YRS', 'AGE_GROUP'], axis=1)
X

Unnamed: 0,DIED,L_THREAT,OTHER_MEDS,CUR_ILL,HISTORY,PRIOR_VAX,ALLERGIES,F,M,JANSSEN,...,VAX_SITE_RA,18-25,26-35,36-45,46-55,56-65,66-75,76-85,86-95,96 +
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0,1,0,0,0,0,0,0,0
1,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,1.0,0,0,0,0,1,0,0,0,0
2,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,1.0,0,0,0,0,1,0,0,0,0
3,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,1.0,0,0,0,0,1,0,0,0,0
4,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,...,1.0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26477,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,...,0.0,0,0,0,0,0,1,0,0,0
26478,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,...,1.0,0,0,0,0,0,0,1,0,0
26479,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0,0,0,0,0,0,1,0,0
26480,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0,0,0,0,0,0,1,0,0


In [5]:
# convert the serverity labels to numbers and create the target data
y = LabelEncoder().fit_transform(data_df['SEVERITY_LEVEL'])
y

array([0, 0, 0, ..., 2, 1, 1])

In [6]:
# split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=7)

In [7]:
# Scaling the train data with StandardScaler()
#scaler = StandardScaler().fit(X_train)
#X_train_scaled = scaler.transform(X_train)
#X_test_scaled = scaler.transform(X_test)

## Testing KNN to Determine Best k value

In [9]:
# Loop through different k values to find which has the highest accuracy.
# Note: We use only odd numbers because we don't want any ties.
train_scores = []
test_scores = []
for k in range(1, 20, 2):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    train_score = knn.score(X_train, y_train)
    test_score = knn.score(X_test, y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")
    
    
plt.plot(range(1, 20, 2), train_scores, marker='o')
plt.plot(range(1, 20, 2), test_scores, marker="x")
plt.xlabel("k neighbors")
plt.ylabel("Testing accuracy score")
plt.show()

k: 1, Train/Test Score: 0.877/0.837
k: 3, Train/Test Score: 0.899/0.884


KeyboardInterrupt: 

Based on the scores above, we will move forward with 5 neighbors for our model. This was run on 3 random states (33, 42, 7) and all began to stabilize at k=5. 

## Create KNN Model 

In [8]:
# Note that k: 5 provides the best accuracy where the classifier starts to stablize
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)
print('k=5 Test Acc: %.3f' % knn.score(X_test, y_test))

k=5 Test Acc: 0.837


In [9]:
X.columns

Index(['DIED', 'L_THREAT', 'OTHER_MEDS', 'CUR_ILL', 'HISTORY', 'PRIOR_VAX',
       'ALLERGIES', 'F', 'M', 'JANSSEN', 'MODERNA', 'PFIZER',
       'VAX_DOSE_SERIES_1', 'VAX_DOSE_SERIES_2', 'VAX_SITE_LA', 'VAX_SITE_RA',
       '18-25', '26-35', '36-45', '46-55', '56-65', '66-75', '76-85', '86-95',
       '96 +'],
      dtype='object')

### Test Prediction 1

In [10]:
# Sample Prediction 1
sample_user1 = [[0,0,0,1,1,1,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,1,1,1,1]]
predicted_symptoms = knn.predict(sample_user1)
print(severity[predicted_symptoms])

['MILD']


In [11]:
# Sample Prediction 2 (return severe)
sample_user3 = [[1,1,20,20,1,1,1,1,1,1,20,1,1,0,1,1,1,20,1,1,20,0,20,20,20]]
knn.predict(sample_user3)

array([0])

In [12]:
# 5 people closest to user 1
tree = KDTree(X)
dist, ind = tree.query(sample_user1, k=200)

#convery ndarry to list 
like_users = ind[0].tolist()
print(like_users)  # indices of 5 closest neighbors

[2740, 21250, 18347, 23274, 25111, 15525, 13401, 19349, 17492, 22047, 21926, 24703, 24719, 26180, 16270, 13821, 14586, 14086, 13847, 15361, 7859, 16705, 18704, 18995, 7856, 22692, 22146, 3683, 20175, 24090, 24706, 24707, 7622, 25964, 26218, 26248, 9613, 2668, 9105, 8895, 9090, 8940, 5802, 5811, 5820, 10556, 11017, 11347, 10553, 2755, 11510, 10444, 12597, 12666, 12466, 12442, 13089, 13224, 13094, 14651, 10713, 13086, 14346, 13617, 14798, 2088, 8949, 11738, 10709, 11908, 13085, 7806, 2741, 7848, 7896, 9868, 294, 2756, 9715, 9721, 3722, 10059, 7634, 7609, 5806, 11354, 5810, 5813, 2124, 10497, 11025, 11310, 11297, 11300, 11124, 10617, 599, 10433, 10568, 10596, 10510, 10229, 10400, 458, 6132, 12559, 12600, 12607, 12670, 12380, 5874, 12557, 13195, 13095, 604, 26480, 284, 13088, 69, 14600, 15321, 2061, 7625, 7853, 9598, 14409, 13424, 13616, 7575, 11355, 11071, 10405, 10364, 10489, 12740, 13084, 13191, 14620, 293, 13604, 7971, 16511, 7613, 3161, 7536, 7819, 7832, 7833, 7855, 7862, 6859, 9895, 

In [16]:
# save nearest 200 neighbors symptoms as list for wordcloud
predicted_symptoms = data_df[['ASSIGNED_GROUP','SEVERITY_LEVEL']].iloc[like_users]
predicted_symptoms = predicted_symptoms['ASSI'.value_counts()
predicted_symptoms.head()

ASSIGNED_GROUP                        SEVERITY_LEVEL
Weakness                              MILD              26
Joint Pain/Discomfort                 MILD              24
Injection Site Irritation/Discomfort  MILD              24
Chills                                MILD              17
Fever                                 MILD              17
dtype: int64

In [20]:
dirty_dict = predicted_symptoms.to_dict()
dirty_dict

{('Weakness', 'MILD'): 26,
 ('Joint Pain/Discomfort', 'MILD'): 24,
 ('Injection Site Irritation/Discomfort', 'MILD'): 24,
 ('Chills', 'MILD'): 17,
 ('Fever', 'MILD'): 17,
 ('Back pain', 'MILD'): 9,
 ('Respiratory distress', 'MODERATE'): 8,
 ('Itching', 'MILD'): 7,
 ('Migraine/Headache', 'MILD'): 6,
 ('Fatigue', 'MILD'): 5,
 ('Blood Pressure Change', 'MILD'): 5,
 ('Muscle Aches/Pain/Tightness', 'MILD'): 5,
 ('Nausea/Vomiting', 'MILD'): 4,
 ('Abdominal Pain/Discomfort', 'MILD'): 4,
 ('Chest Discomfort', 'MILD'): 4,
 ('Anxiety', 'MILD'): 4,
 ('Muscle Spasms', 'MILD'): 3,
 ('Vertigo', 'MILD'): 2,
 ('Skin sensitivity', 'MILD'): 2,
 ('Alterred Vision', 'MODERATE'): 2,
 ('Skin Rash', 'MILD'): 2,
 ('Sensitivity', 'MILD'): 2,
 ('Diarrhea', 'MILD'): 2,
 ('Cough', 'MILD'): 2,
 ('Oral discomfort', 'MILD'): 1,
 ('Under Arm Pain', 'MILD'): 1,
 ('Runny Nose', 'MILD'): 1,
 ('Pulmonary', 'SEVERE'): 1,
 ('Facial Paralysis', 'MODERATE'): 1,
 ('Appetite Decreased', 'MILD'): 1,
 ('Feeling Abnormal', 'MILD'

In [23]:
words = []
for key in dirty_dict:
    x = {'x': key[0], 'value': dirty_dict[key]*100, 'category': key[1]}
    words.append(x)
    
words = words
words

[{'x': 'Weakness', 'value': 2600, 'category': 'MILD'},
 {'x': 'Joint Pain/Discomfort', 'value': 2400, 'category': 'MILD'},
 {'x': 'Injection Site Irritation/Discomfort',
  'value': 2400,
  'category': 'MILD'},
 {'x': 'Chills', 'value': 1700, 'category': 'MILD'},
 {'x': 'Fever', 'value': 1700, 'category': 'MILD'},
 {'x': 'Back pain', 'value': 900, 'category': 'MILD'},
 {'x': 'Respiratory distress', 'value': 800, 'category': 'MODERATE'},
 {'x': 'Itching', 'value': 700, 'category': 'MILD'},
 {'x': 'Migraine/Headache', 'value': 600, 'category': 'MILD'},
 {'x': 'Fatigue', 'value': 500, 'category': 'MILD'},
 {'x': 'Blood Pressure Change', 'value': 500, 'category': 'MILD'},
 {'x': 'Muscle Aches/Pain/Tightness', 'value': 500, 'category': 'MILD'},
 {'x': 'Nausea/Vomiting', 'value': 400, 'category': 'MILD'},
 {'x': 'Abdominal Pain/Discomfort', 'value': 400, 'category': 'MILD'},
 {'x': 'Chest Discomfort', 'value': 400, 'category': 'MILD'},
 {'x': 'Anxiety', 'value': 400, 'category': 'MILD'},
 {'x

In [None]:
# count the number of occurrences from nearest neighboors
def count_levels(lst):
    mld_ct = 0
    mod_ct = 0
    sev_ct = 0 
    for person in lst:
        if (person == 'MILD'):
            mld_ct = mld_ct + 1
        if (person == 'MODERATE'):
            mod_ct = mod_ct + 1
        if (person == 'SEVERE'):
            sev_ct = sev_ct + 1    
            
    return mld_ct, mod_ct, sev_ct

In [None]:
count_levels(predicted_severity)

In [None]:
cloud_maker(predicted_symptoms)

In [None]:
# save nearest 50 neighbors symptoms as list for wordcloud
predicted_symptoms = data_df['ASSIGNED_GROUP'].iloc[like_users].tolist()
predicted_symptoms[:5]

In [None]:
cloud_maker(predicted_symptoms)

In [None]:
#convert list to string and generate
unique_string=(" ").join(predicted_symptoms)

#create circle mask
x, y = np.ogrid[:300, :300]
mask = (x - 150) ** 2 + (y - 150) ** 2 > 130 ** 2
mask = 255 * mask.astype(int)


#test mask
from PIL import Image
from os import path
# get data directory (using getcwd() is needed to support running example in generated IPython notebook)
d = path.dirname(__file__) if "__file__" in locals() else os.getcwd()
new_mask = np.array(Image.open(path.join(d, 'circle_mask.png')))

wordcloud = WordCloud(background_color="white", repeat=True,
                      width = 1000, height = 500, mask=mask).generate(unique_string)

plt.figure(figsize=(15,8))
plt.imshow(wordcloud)
plt.axis("off")
plt.imshow(wordcloud.recolor(color_func=image_colors), interpolation="bilinear")
#plt.savefig("your_file_name"+".png", bbox_inches='tight')
plt.show()
plt.close()

In [None]:
# Sample Prediction 2
sample_user2 = [[0,0,0,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0]]
predicted_symptoms = knn.predict(sample_user2)
print(severity[predicted_symptoms])

In [None]:
predicted_severity = data_df['SEVERITY_LEVEL'].iloc[like_users].tolist()
predicted_severity[:5]

In [None]:
count_levels(predicted_severity)

In [None]:
tree = KDTree(X)
ind = tree.query([[1.25, 1.35]], k=5)
print(ind)  # indices of 5 closest neighbors


print('People like you reported the following symptoms:')