<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Load-pandas,-pyod,-and-scikit-learn,-along-with-the-Covid-case-data" data-toc-modified-id="Load-pandas,-pyod,-and-scikit-learn,-along-with-the-Covid-case-data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Load pandas, pyod, and scikit-learn, along with the Covid case data</a></span></li><li><span><a href="#Create-a-standardized-data-frame-of-the-analysis-columns" data-toc-modified-id="Create-a-standardized-data-frame-of-the-analysis-columns-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Create a standardized data frame of the analysis columns</a></span></li><li><span><a href="#Run-the-KNN-model-and-generate-anomaly-scores" data-toc-modified-id="Run-the-KNN-model-and-generate-anomaly-scores-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Run the KNN model and generate anomaly scores</a></span></li><li><span><a href="#Show-the-predictions-from-the-model" data-toc-modified-id="Show-the-predictions-from-the-model-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Show the predictions from the model</a></span></li><li><span><a href="#Show-the-COVID-data-for-the-outliers" data-toc-modified-id="Show-the-COVID-data-for-the-outliers-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Show the COVID data for the outliers</a></span></li></ul></div>

# Load pandas, pyod, and scikit-learn, along with the Covid case data

In [1]:
import pandas as pd
from pyod.models.knn import KNN
from sklearn.preprocessing import StandardScaler

In [2]:
# Optional pandas display settings, left disabled. Uncommenting them would
# set the display width to 80 characters, show at most 7 columns and
# 20 rows, and format floats with thousands separators and two decimals.
# pd.set_option('display.width', 80)
# pd.set_option('display.max_columns', 7)
# pd.set_option('display.max_rows', 20)
# pd.options.display.float_format = '{:,.2f}'.format

In [3]:
# Load the watermark IPython extension and print the interpreter and
# package versions this notebook was run with (see the output below).
import watermark
%load_ext watermark

%watermark -n -v -iv

Python implementation: CPython
Python version       : 3.7.9
IPython version      : 7.20.0

pandas   : 1.2.1
watermark: 2.1.0
json     : 2.0.9



In [4]:
# Load the COVID totals data and key the rows by ISO code.
# set_index is chained instead of called with inplace=True, so the cell is a
# single assignment and the frame is never mutated after it is created.
covidtotals = pd.read_csv('data/covidtotals.csv').set_index('iso_code')

# Create a standardized data frame of the analysis columns

In [5]:
# Scaler used further down to standardize the numeric analysis columns
# before they are passed to the KNN outlier model.
standardizer = StandardScaler()

In [6]:
# Columns kept for the analysis: the 'location' label first, followed by
# the numeric measures. Later cells skip the first column when scaling.
analysisvars = [
    'location',
    'total_cases_pm',
    'total_deaths_pm',
    'pop_density',
    'median_age',
    'gdp_per_capita',
]

In [7]:
# Keep only the analysis columns, then discard every row that has a
# missing value in any of them.
covidanalysis = covidtotals[analysisvars].dropna()

In [9]:
# Show the last two rows as a quick check of the selected columns.
covidanalysis.tail(2)

Unnamed: 0_level_0,location,total_cases_pm,total_deaths_pm,pop_density,median_age,gdp_per_capita
iso_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ZMB,Zambia,57.496,0.381,22.995,17.7,3689.251
ZWE,Zimbabwe,11.976,0.269,42.729,19.6,1899.775


In [10]:
# Standardize only the numeric measures. 'location' is dropped by name
# rather than by position, so the selection no longer depends on where
# that column sits in the frame. fit_transform returns the scaled values
# without the index; later cells re-attach covidanalysis.index.
covidanalysisstand = standardizer.fit_transform(
    covidanalysis.drop(columns=['location']))

# Run the KNN model and generate anomaly scores

In [12]:
# NOTE(review): clf_name is not referenced again in the visible cells.
clf_name = 'KNN'
# contamination=0.1 asks the detector to treat roughly 10% of the rows as
# outliers; the value counts further down show 18 of 175 rows flagged.
clf = KNN(contamination=0.1)

In [13]:
# Fit the detector on the standardized features. The following cells read
# the fitted labels_ and decision_scores_ attributes from clf.
clf.fit(covidanalysisstand)

KNN(algorithm='auto', contamination=0.1, leaf_size=30, method='largest',
  metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=5, p=2,
  radius=1.0)

In [15]:
# Labels assigned to the fitted rows; later cells treat 1 as an outlier.
y_pred = clf.labels_
# Peek at the first three labels.
y_pred[:3]

array([0, 0, 0])

In [16]:
# Anomaly scores for the fitted rows; in the summaries below the rows
# labelled as outliers carry the higher scores.
y_scores = clf.decision_scores_
# Peek at the first three scores.
y_scores[:3]

array([0.15961874, 0.43272345, 0.25485372])

# Show the predictions from the model

In [17]:
# Combine the labels and scores into one frame, indexed like covidanalysis
# so the predictions can be joined back to the country data.
# The frame is built column-wise from a dict, which keeps each array's dtype
# and avoids turning the arrays into Python tuples row by row.
pred = pd.DataFrame({'outlier': y_pred, 'scores': y_scores},
                    index=covidanalysis.index)

In [18]:
# Show ten randomly chosen predictions; random_state is fixed so the same
# rows are drawn each time the cell runs on the same data.
pred.sample(10, random_state=1)

Unnamed: 0_level_0,outlier,scores
iso_code,Unnamed: 1_level_1,Unnamed: 2_level_1
LBY,0,0.367532
NLD,1,1.564052
BTN,0,0.185273
HTI,0,0.433058
EST,0,0.464279
LCA,0,0.431069
PER,0,1.411336
BRB,0,0.769047
MDA,0,0.912486
NAM,0,0.310553


In [19]:
# Count how many rows received each label (0 vs. 1).
pred['outlier'].value_counts()

0    157
1     18
Name: outlier, dtype: int64

In [20]:
# Summarize the anomaly scores within each label: minimum, median, maximum.
# In the output below the lowest score among rows labelled 1 is above the
# highest score among rows labelled 0.
pred.groupby(['outlier'])[['scores']].agg(['min', 'median', 'max'])

Unnamed: 0_level_0,scores,scores,scores
Unnamed: 0_level_1,min,median,max
outlier,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
0,0.081259,0.364803,1.519804
1,1.553681,2.096079,9.483549


# Show the COVID data for the outliers

In [21]:
# Attach the predictions to the country data, keep only the rows flagged
# as outliers, and list them from the highest anomaly score downwards.
(
    covidanalysis
    .join(pred)
    .loc[pred['outlier'] == 1,
         ['location', 'total_cases_pm', 'total_deaths_pm', 'scores']]
    .sort_values(['scores'], ascending=False)
)

Unnamed: 0_level_0,location,total_cases_pm,total_deaths_pm,scores
iso_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
SGP,Singapore,5962.727,3.931,9.483549
QAT,Qatar,19753.146,13.19,8.001605
HKG,Hong Kong,0.0,0.0,7.771722
BEL,Belgium,5037.354,816.852,3.544827
BHR,Bahrain,6698.468,11.166,2.842906
LUX,Luxembourg,6418.776,175.726,2.442886
ESP,Spain,5120.952,580.197,2.176441
KWT,Kuwait,6332.42,49.642,2.127519
GBR,United Kingdom,4047.403,566.965,2.103458
ITA,Italy,3853.985,552.663,2.0887
