# Outlier analysis
The problem statement consistently describes the application to be delivered with the prefix 'Robust'.<br>
A statistician would define robust as not sensitive to extreme values, i.e. resistant to outliers.<br>
That gives us the task of identifying the outliers in our data.

In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import pmdarima as pm

# Load the state-level (S) and district-level (D) tables.
# The first CSV column becomes the row index and the first two CSV rows
# form a two-level column header.
S = pd.read_csv('../Data/States.csv', index_col = [0], header = [0, 1])
D = pd.read_csv('../Data/Districts.csv', index_col = [0], header = [0, 1])

In [2]:
# Preview the first rows of the state-level table.
S.head()

Year,2005,2005,2006,2006,2007,2007,2008,2008,2009,2009,...,2015,2015,2016,2016,2017,2017,2018,2018,2019,2019
Monsoon,Pre,Post,Pre,Post,Pre,Post,Pre,Post,Pre,Post,...,Pre,Post,Pre,Post,Pre,Post,Pre,Post,Pre,Post
State,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Andhra Pradesh,8.125064,3.616295,5.944778,4.474054,6.748905,3.343564,5.688929,4.252791,6.053105,4.795315,...,6.871261,5.350703,6.617692,5.285926,7.901651,4.5975,7.355833,5.959838,7.74894,4.521017
Arunachal Pradesh,3.131667,3.312083,4.376458,3.68125,3.800208,3.704167,4.776667,3.658542,5.321042,3.962917,...,5.426042,3.645208,4.685,3.43,4.946458,4.072083,4.735208,4.179375,5.7875,4.822083
Assam,2.957709,2.66489,3.184601,2.975361,3.200176,2.84776,3.896374,2.738243,3.781899,2.885703,...,3.697708,2.674555,4.02254,3.091223,3.854004,2.765975,3.540041,3.087709,3.665029,2.899018
Bihar,5.029483,3.690866,4.728042,3.226525,4.858216,3.263597,4.813665,3.392022,4.778539,3.657704,...,5.181772,4.305143,5.7199,3.412368,5.270182,3.659309,5.645378,3.847776,5.994683,3.51297
Chandigarh,13.671111,13.355,13.71625,13.672857,12.32875,16.948182,17.018182,15.741818,16.303636,16.810909,...,18.251818,16.664545,16.969091,18.476364,19.495,18.954,19.689,17.628182,18.203636,16.660909


In [3]:
# Preview the first rows of the district-level table.
D.head()

Year,2005,2005,2006,2006,2007,2007,2008,2008,2009,2009,...,2015,2015,2016,2016,2017,2017,2018,2018,2019,2019
Monsoon,Pre,Post,Pre,Post,Pre,Post,Pre,Post,Pre,Post,...,Pre,Post,Pre,Post,Pre,Post,Pre,Post,Pre,Post
Andhra Pradesh_Anantapur,12.946818,7.335909,8.597727,9.532727,11.475,6.940227,9.089545,5.965227,7.397955,6.210909,...,9.7775,7.975897,9.458974,9.937949,12.611316,7.752973,11.002857,10.207059,12.710882,7.668824
Andhra Pradesh_Chittoor,10.997838,6.366757,6.925405,6.55027,8.979189,6.00973,7.401622,7.142703,7.902162,8.031622,...,9.147353,7.591515,6.116452,6.591379,8.715769,5.340833,8.071739,8.992609,9.406364,7.654545
Andhra Pradesh_East Godavari,5.357714,2.818286,4.750278,2.977778,4.904167,2.796111,4.300278,3.517778,4.800833,3.136667,...,4.448824,3.735588,8.298182,3.079375,8.9725,2.601333,9.932414,3.013448,9.457586,2.773929
Andhra Pradesh_Guntur,5.145714,1.930286,3.635714,2.398571,4.230857,2.121143,3.554857,2.29,3.416286,3.054286,...,5.605806,4.813871,5.969677,3.649655,5.96069,3.91931,6.018966,5.088276,6.287143,3.209259
Andhra Pradesh_Krishna,7.294,3.138,5.17,3.4468,5.4368,3.1512,5.3584,3.0396,5.3736,4.5488,...,5.6628,4.4732,6.236667,3.81087,6.112174,4.913182,6.801818,4.614348,5.963478,4.122609


We found an article describing a way to identify the outliers in a given sample, and implemented a function that applies it.

In [4]:
def outlier(x, threshold = 3):
    """Flag the entries of ``x`` whose absolute z-score exceeds ``threshold``.

    NaN entries are left out of the z-score computation and are never
    flagged. Each flag is reported at the position the value occupies in
    ``x`` itself, so NaNs earlier in ``x`` do not shift the flags.

    Parameters
    ----------
    x : pandas.Series or 1-D array-like of numbers
        The sample to inspect (one row of the table when used with
        ``DataFrame.apply(..., axis = 1)``).
    threshold : float, default 3
        Absolute z-score above which a value counts as an outlier.

    Returns
    -------
    pandas.Series of bool
        Same length as ``x``, with a default integer index; element ``i``
        is True when the ``i``-th value of ``x`` is an outlier.
    """
    values = np.asarray(x, dtype = float)
    flags = np.zeros(len(values), dtype = bool)
    valid = ~np.isnan(values)
    if valid.any():
        # z-scores are computed on the non-NaN values only and written back
        # through the same boolean selector, which keeps them aligned with
        # the original positions.
        flags[valid] = np.abs(stats.zscore(values[valid])) > threshold
    return pd.Series(flags)

The z-score standardises the sample so that it is centred on 0 with unit standard deviation, so outliers show up as values lying far from 0.<br>
The article suggests that an absolute z-score of 3 is the standard threshold data scientists use to identify outliers with the z-score method.

In [5]:
# Flag outliers row by row, i.e. within each state's own series of readings.
outliers_mask = S.apply(outlier, axis = 1)
# The per-row result comes back with positional column labels; restore the
# (Year, Monsoon) labels so the mask lines up with S.
outliers_mask.columns = S.columns
# Keep the non-outlier cells and turn the flagged ones into NaN. Negating the
# boolean frame directly replaces the element-wise (and deprecated) applymap.
S = S.where(~outliers_mask.astype(bool))
S.head()

Year,2005,2005,2006,2006,2007,2007,2008,2008,2009,2009,...,2015,2015,2016,2016,2017,2017,2018,2018,2019,2019
Monsoon,Pre,Post,Pre,Post,Pre,Post,Pre,Post,Pre,Post,...,Pre,Post,Pre,Post,Pre,Post,Pre,Post,Pre,Post
State,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Andhra Pradesh,8.125064,3.616295,5.944778,4.474054,6.748905,3.343564,5.688929,4.252791,6.053105,4.795315,...,6.871261,5.350703,6.617692,5.285926,7.901651,4.5975,7.355833,5.959838,7.74894,4.521017
Arunachal Pradesh,3.131667,3.312083,4.376458,3.68125,3.800208,3.704167,4.776667,3.658542,5.321042,3.962917,...,5.426042,3.645208,4.685,3.43,4.946458,4.072083,4.735208,4.179375,5.7875,4.822083
Assam,2.957709,2.66489,3.184601,2.975361,3.200176,2.84776,3.896374,2.738243,3.781899,2.885703,...,3.697708,2.674555,4.02254,3.091223,3.854004,2.765975,3.540041,3.087709,3.665029,2.899018
Bihar,5.029483,3.690866,4.728042,3.226525,4.858216,3.263597,4.813665,3.392022,4.778539,3.657704,...,5.181772,4.305143,5.7199,3.412368,5.270182,3.659309,5.645378,3.847776,5.994683,3.51297
Chandigarh,13.671111,13.355,13.71625,13.672857,12.32875,16.948182,17.018182,15.741818,16.303636,16.810909,...,18.251818,16.664545,16.969091,18.476364,19.495,18.954,19.689,17.628182,18.203636,16.660909


In [6]:
# Save the state table after outlier masking.
S.to_csv('../Data/RobustStates.csv')

In [7]:
# Flag outliers row by row, i.e. within each district's own series of readings.
outliers_mask = D.apply(outlier, axis = 1)
# The per-row result comes back with positional column labels; restore the
# (Year, Monsoon) labels so the mask lines up with D.
outliers_mask.columns = D.columns
# Keep the non-outlier cells and turn the flagged ones into NaN. Negating the
# boolean frame directly replaces the element-wise (and deprecated) applymap.
D = D.where(~outliers_mask.astype(bool))
D.head()

Year,2005,2005,2006,2006,2007,2007,2008,2008,2009,2009,...,2015,2015,2016,2016,2017,2017,2018,2018,2019,2019
Monsoon,Pre,Post,Pre,Post,Pre,Post,Pre,Post,Pre,Post,...,Pre,Post,Pre,Post,Pre,Post,Pre,Post,Pre,Post
Andhra Pradesh_Anantapur,12.946818,7.335909,8.597727,9.532727,11.475,6.940227,9.089545,5.965227,7.397955,6.210909,...,9.7775,7.975897,9.458974,9.937949,12.611316,7.752973,11.002857,10.207059,12.710882,7.668824
Andhra Pradesh_Chittoor,10.997838,6.366757,6.925405,6.55027,8.979189,6.00973,7.401622,7.142703,7.902162,8.031622,...,9.147353,7.591515,6.116452,6.591379,8.715769,5.340833,8.071739,8.992609,9.406364,7.654545
Andhra Pradesh_East Godavari,5.357714,2.818286,4.750278,2.977778,4.904167,2.796111,4.300278,3.517778,4.800833,3.136667,...,4.448824,3.735588,8.298182,3.079375,8.9725,2.601333,9.932414,3.013448,9.457586,2.773929
Andhra Pradesh_Guntur,5.145714,1.930286,3.635714,2.398571,4.230857,2.121143,3.554857,2.29,3.416286,3.054286,...,5.605806,4.813871,5.969677,3.649655,5.96069,3.91931,6.018966,5.088276,6.287143,3.209259
Andhra Pradesh_Krishna,7.294,3.138,5.17,3.4468,5.4368,3.1512,5.3584,3.0396,5.3736,4.5488,...,5.6628,4.4732,6.236667,3.81087,6.112174,4.913182,6.801818,4.614348,5.963478,4.122609


In [8]:
# Count the missing values in each (Year, Monsoon) column of D.
D.isna().sum()

Year  Monsoon
2005  Pre        0
      Post       2
2006  Pre        0
      Post       0
2007  Pre        0
      Post       0
2008  Pre        0
      Post       0
2009  Pre        0
      Post       0
2010  Pre        1
      Post       0
2011  Pre        0
      Post       0
2012  Pre        0
      Post       0
2013  Pre        2
      Post       1
2014  Pre        0
      Post       1
2015  Pre        6
      Post       0
2016  Pre        1
      Post       0
2017  Pre        0
      Post       0
2018  Pre        0
      Post       2
2019  Pre        5
      Post       4
dtype: int64

In [9]:
# Fill the missing 2005 post-monsoon value with the mean of the district's
# later post-monsoon readings (every second column starting at position 3).
# The assignment goes through a single .loc[row, (year, season)] call: the
# chained form D.loc[row][year][season] = ... writes into an intermediate
# object and is not guaranteed to update D.
D.loc['Tamil Nadu_Cuddalore', ('2005', 'Post')] = np.nanmean(D.loc['Tamil Nadu_Cuddalore'].iloc[3::2])
D.loc['Tamil Nadu_Cuddalore']['2005']

Monsoon
Pre     14.756333
Post    13.358786
Name: Tamil Nadu_Cuddalore, dtype: float64

In [10]:
# Fill the missing 2005 post-monsoon value with the mean of the district's
# later post-monsoon readings (every second column starting at position 3).
# The assignment goes through a single .loc[row, (year, season)] call: the
# chained form D.loc[row][year][season] = ... writes into an intermediate
# object and is not guaranteed to update D.
D.loc['Uttar Pradesh_Muzaffarnagar', ('2005', 'Post')] = np.nanmean(D.loc['Uttar Pradesh_Muzaffarnagar'].iloc[3::2])
D.loc['Uttar Pradesh_Muzaffarnagar']['2005']

Monsoon
Pre     5.037500
Post    5.818607
Name: Uttar Pradesh_Muzaffarnagar, dtype: float64

In [11]:
# Fill the missing 2010 pre-monsoon value with the mean of the district's
# other pre-monsoon readings (every second column starting at position 0;
# nanmean skips the missing cell itself).
# The assignment goes through a single .loc[row, (year, season)] call: the
# chained form D.loc[row][year][season] = ... writes into an intermediate
# object and is not guaranteed to update D.
D.loc['Assam_Sivasagar', ('2010', 'Pre')] = np.nanmean(D.loc['Assam_Sivasagar'].iloc[::2])
D.loc['Assam_Sivasagar']['2010']

Monsoon
Pre     3.211905
Post    2.744000
Name: Assam_Sivasagar, dtype: float64

As we can see, only 3 NaN values fall before 2012. These can't be filled using ARIMA, so we fill each of them with the average of the district's other values for the same season.<br>
Next we'll write a function that uses ARIMA to fill in all the other NaNs.

In [12]:
def arima(ser, no_of_p = 1):
    """Fit a non-seasonal auto-ARIMA model to ``ser`` and forecast ahead.

    The order search is limited to p and q between 0 and 2 with the
    differencing order fixed at d = 1.

    Parameters
    ----------
    ser : sequence of numbers
        The observed history, oldest value first.
    no_of_p : int, default 1
        Number of periods passed to the model's ``predict``.

    Returns
    -------
    The first value of the forecast. Only this value is returned, even
    when ``no_of_p`` is greater than 1.
    """
    search_options = dict(
        start_p = 0,
        max_p = 2,
        d = 1,
        start_q = 0,
        max_q = 2,
        seasonal = False,
        trace = False,
        error_action = 'ignore',
        suppress_warnings = True,
    )
    fitted_model = pm.auto_arima(y = ser, **search_options)
    forecast = fitted_model.predict(n_periods = no_of_p)
    return list(forecast)[0]

In [13]:
# Districts that still contain at least one NaN, ordered by ascending NaN
# count. Selecting on the count itself avoids hard-coding how many such
# districts there are.
na_per_district = D.isna().sum(axis = 1).sort_values()
ds = list(na_per_district[na_per_district > 0].index)
ds

['Uttar Pradesh_Farrukhabad',
 'Uttar Pradesh_Etah',
 'Haryana_Rewari',
 'Gujarat_Valsad',
 'Punjab_Moga',
 'Gujarat_Surat',
 'Rajasthan_Bikaner',
 'Assam_Dibrugarh',
 'Madhya Pradesh_Guna',
 'Assam_Kokrajhar',
 'Meghalaya_South Garo Hills',
 'Gujarat_Dahod',
 'Gujarat_Bharuch',
 'Punjab_Muktsar',
 'Odisha_Bhadrak',
 'Assam_Chirang',
 'Haryana_Sirsa',
 'Daman and Diu_Diu',
 'Haryana_Kurukshetra']

In [14]:
def _fill_gaps_with_arima(frame, districts, first_col):
    """Fill NaNs in every second column of ``frame`` with ARIMA forecasts.

    For each district, the columns ``first_col, first_col + 2, ...`` are
    walked in order. A missing value is replaced by the one-step forecast
    computed from the values that precede it in that same walk, and the
    forecast then joins the history so any later gap in the row can be
    filled too.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table to fill; modified in place.
    districts : iterable
        Row labels of ``frame`` to process.
    first_col : int
        Position of the first column of the series (0 for the 'Pre'
        columns, 1 for the 'Post' columns of this table).
    """
    series_cols = frame.columns[first_col::2]
    for district in districts:
        history = []
        for col in series_cols:
            value = frame.loc[district, col]
            if pd.isna(value):
                value = arima(ser = history)
                # Write through frame.loc so the table itself is updated,
                # not a temporary Series derived from it.
                frame.loc[district, col] = value
            history.append(value)


# Pre-monsoon readings sit in the even column positions, post-monsoon
# readings in the odd ones.
_fill_gaps_with_arima(D, ds, 0)
_fill_gaps_with_arima(D, ds, 1)

  return np.roots(self.polynomial_reduced_ar)**-1


In [15]:
# Total number of missing values left in D; 0 means every gap has been filled.
D.isna().sum().sum()

0

In [16]:
# Save the district table after outlier masking and gap filling.
D.to_csv('../Data/RobustDistricts.csv')