In [31]:
import pandas as pd
import sklearn
import numpy as np

In [32]:
df = pd.read_csv("cleaned_data.csv")

In [33]:
df.columns

Index(['VAERS_ID', 'RECVDATE', 'STATE', 'AGE_YRS', 'SEX', 'SYMPTOM_TEXT',
       'DIED', 'HOSPITAL', 'RECOVD', 'VAX_DATE', 'NUMDAYS', 'V_ADMINBY',
       'OTHER_MEDS', 'CUR_ILL', 'HISTORY', 'OFC_VISIT', 'ER_ED_VISIT',
       'ALLERGIES', 'VAX_MANU', 'VAX_LOT', 'VAX_SITE', 'VAX_NAME', 'SYMPTOM1',
       'SYMPTOM2', 'SYMPTOM3', 'SYMPTOM4', 'SYMPTOM5', 'SYMPTOM6', 'SYMPTOM7',
       'SYMPTOM8', 'SYMPTOM9', 'SYMPTOM10'],
      dtype='object')

In [34]:
df['VAX_MANU'].unique()

array(['MODERNA', 'PFIZER\\BIONTECH', 'UNKNOWN MANUFACTURER', 'JANSSEN'],
      dtype=object)

In [35]:
y = df['VAX_MANU']
y

0                MODERNA
1                MODERNA
2        PFIZER\BIONTECH
3                MODERNA
4                MODERNA
              ...       
31500            MODERNA
31501            MODERNA
31502    PFIZER\BIONTECH
31503            MODERNA
31504            MODERNA
Name: VAX_MANU, Length: 31505, dtype: object

In [36]:
# preliminary feature list
features = ['AGE_YRS', 'SEX', 'DIED', 'HOSPITAL', 'OFC_VISIT', 'ER_ED_VISIT', 'NUMDAYS', 'SYMPTOM1',
       'SYMPTOM2', 'SYMPTOM3', 'SYMPTOM4', 'SYMPTOM5', 'SYMPTOM6', 'SYMPTOM7',
       'SYMPTOM8', 'SYMPTOM9', 'SYMPTOM10']

In [37]:
X = df[features]
X

Unnamed: 0,AGE_YRS,SEX,DIED,HOSPITAL,OFC_VISIT,ER_ED_VISIT,NUMDAYS,SYMPTOM1,SYMPTOM2,SYMPTOM3,SYMPTOM4,SYMPTOM5,SYMPTOM6,SYMPTOM7,SYMPTOM8,SYMPTOM9,SYMPTOM10
0,33.000000,F,N,N,Y,N,2.0,Dysphagia,Epiglottitis,,,,,,,,
1,73.000000,F,N,N,Y,N,0.0,Anxiety,Dyspnoea,,,,,,,,
2,23.000000,F,N,N,N,Y,0.0,Chest discomfort,Dysphagia,Pain in extremity,Visual impairment,,,,,,
3,58.000000,F,N,N,N,N,0.0,Dizziness,Fatigue,Mobility decreased,,,,,,,
4,47.000000,F,N,N,N,N,7.0,Injection site erythema,Injection site pruritus,Injection site swelling,Injection site warmth,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31500,85.000000,M,N,Y,N,N,26.0,Acute hepatic failure,Hepatic enzyme,Hepatic enzyme increased,,,,,,,
31501,88.000000,F,N,Y,N,Y,43.0,Dysphagia,Hernia,Obstruction,,,,,,,
31502,59.000000,F,Y,N,N,N,10.0,Haemophagocytic lymphohistiocytosis,SARS-CoV-2 test,,,,,,,,
31503,57.000000,M,Y,N,N,N,0.0,Death,,,,,,,,,


In [40]:
X['SYMPTOM1'].unique().size

1398

In [26]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

In [45]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import CategoricalNB

In [46]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
cnb = CategoricalNB()

In [48]:
from sklearn.ensemble import RandomForestClassifier

In [49]:
rf = RandomForestClassifier(random_state=0)

In [51]:
from sklearn.preprocessing import OneHotEncoder

In [59]:
for i in range(1, 11):
    colname = "SYMPTOM" + str(i)
    X[colname].fillna("None", inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


In [62]:
X.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.dropna(inplace=True)


In [69]:
cats = ['SEX', 'DIED', 'HOSPITAL', 'OFC_VISIT', 'ER_ED_VISIT', 'SYMPTOM1',
       'SYMPTOM2', 'SYMPTOM3', 'SYMPTOM4', 'SYMPTOM5', 'SYMPTOM6', 'SYMPTOM7',
       'SYMPTOM8', 'SYMPTOM9', 'SYMPTOM10']

In [79]:
enc = OneHotEncoder(handle_unknown='ignore', sparse=False)

In [80]:
res = enc.fit_transform(X[cats])

In [81]:
res

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

In [83]:
len(res)

31505

In [76]:
X[['AGE_YRS', 'NUMDAYS']]

Unnamed: 0,AGE_YRS,NUMDAYS
0,33.000000,2.0
1,73.000000,0.0
2,23.000000,0.0
3,58.000000,0.0
4,47.000000,7.0
...,...,...
31500,85.000000,26.0
31501,88.000000,43.0
31502,59.000000,10.0
31503,57.000000,0.0


In [84]:
concat = np.concatenate([X[['AGE_YRS', 'NUMDAYS']], res], axis=1)

In [85]:
len(concat)

31505

In [86]:
clf = RandomForestClassifier(random_state=0)

In [87]:
X_train, X_test, y_train, y_test = train_test_split(concat, y, test_size=0.3, random_state=0)

In [88]:
clf.fit(X_train, y_train)

RandomForestClassifier(random_state=0)

In [90]:
pred = clf.predict(X_test)

In [91]:
sum(pred == y_test)/len(y_test)

0.6219847651290732

Now try KMeans clustering to figure out what the unknown vaccine is.

In [93]:
df['VAX_MANU'].value_counts()

MODERNA                 15334
PFIZER\BIONTECH         15090
JANSSEN                  1067
UNKNOWN MANUFACTURER       14
Name: VAX_MANU, dtype: int64

In [94]:
from sklearn.cluster import KMeans

In [96]:
kmeans = KMeans(n_clusters=3, random_state=0).fit_predict(concat)
kmeans

array([0, 0, 0, ..., 0, 0, 0], dtype=int32)

In [99]:
pd.Series(kmeans).value_counts()

0    31479
2       14
1       12
dtype: int64