We are integrating malaria data with the following datasets for measuring the malaria indices over the course of years:

- NDVI
- NTL
- Temperature
- Rainfall

Currently following are integrated in one table:

- NDVI ✔
- NTL ✔
- Temperature ✔
- Rainfall ✔

After Integrating them, the data is converted into Supervised timeseries format.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.preprocessing import LabelEncoder

### Malaria Table

- Top 5 Entries

In [2]:
country='India'
df_malaria = pd.read_csv(f'./Data Saperation/{country}/{country}_malaria.csv').drop('Unnamed: 0', axis=1)
print(f'Total entries in Malaria table:\t{df_malaria.shape[0]}')
df_malaria.head(5)

Total entries in Malaria table:	28526


Unnamed: 0,country,country_code,end_year,dhsid,dhsregco,dhsregna,dhsyear,URBAN_RURA,latnum,longnum,...,pfir_2008,pfir_2009,pfir_2010,pfir_2011,pfir_2012,pfir_2013,pfir_2014,pfir_2015,pfir_2016,pfir_2017
0,India,IA,2016,IA201400010001,640,South Andaman,2014,Urban,11.670645,92.745799,...,0.05855,0.083761,0.093975,0.077659,0.047645,0.010011,0.001801,0.000904,0.000942,0.000942
1,India,IA,2016,IA201400010002,638,Nicobars,2014,Rural,9.143844,92.826752,...,0.000889,0.002208,0.003995,0.006132,0.004235,0.000725,0.00014,0.000112,0.00012,0.00012
2,India,IA,2016,IA201400010003,638,Nicobars,2014,Rural,9.212246,92.753633,...,0.000882,0.002179,0.003862,0.005853,0.003941,0.000627,0.000109,8.2e-05,8.8e-05,8.8e-05
3,India,IA,2016,IA201400010004,638,Nicobars,2014,Rural,9.165413,92.742696,...,0.000928,0.002307,0.004116,0.006255,0.004272,0.000705,0.000129,9.9e-05,0.000106,0.000106
4,India,IA,2016,IA201400010005,638,Nicobars,2014,Rural,8.307356,93.093792,...,0.001083,0.002861,0.00525,0.007984,0.005847,0.001111,0.000234,0.000191,0.000203,0.000203


- Columns and Number of Columns

1. country        : Contains Country name
2. country_code   : Contains Country Code
3. end_year       : ?
4. dhsid          : ?
5. dhsregco       : ?
6. dhsregna       : ?
7. dhsyear        : ?
8. URBAN_RURA     : ?
9. latnum         : ?
10. longnum        : ?
11. v001           : ?
12. ->19.\
    pfir_2000 -> 17 : incidence rate in that year


In [3]:
print(f'Total columns: {len(df_malaria.columns)}')
print(f'Column names: ', df_malaria.columns.values)


Total columns: 29
Column names:  ['country' 'country_code' 'end_year' 'dhsid' 'dhsregco' 'dhsregna'
 'dhsyear' 'URBAN_RURA' 'latnum' 'longnum' 'v001' 'pfir_2000' 'pfir_2001'
 'pfir_2002' 'pfir_2003' 'pfir_2004' 'pfir_2005' 'pfir_2006' 'pfir_2007'
 'pfir_2008' 'pfir_2009' 'pfir_2010' 'pfir_2011' 'pfir_2012' 'pfir_2013'
 'pfir_2014' 'pfir_2015' 'pfir_2016' 'pfir_2017']


### NDVI Table

In [4]:
df_ndvi = pd.read_csv(f'./Data Saperation/{country}/{country}_ndvi.csv').drop('Unnamed: 0', axis=1)
print(f'Total Entries in NDVI table:\t{df_ndvi.shape[0]}')
df_ndvi.head(5)

Total Entries in NDVI table:	28526


Unnamed: 0,country,country_code,end_year,dhsid,dhsregco,dhsregna,dhsyear,URBAN_RURA,latnum,longnum,...,N2019_3,N2019_4,N2019_5,N2019_6,N2019_7,N2019_8,N2019_9,N2019_10,N2019_11,N2019_12
0,India,IA,2016,IA201400010001,640,South Andaman,,Urban,11.670645,92.745799,...,0.594767,0.598962,0.669265,0.640744,0.669232,0.610302,0.708107,0.670936,0.713656,0.685143
1,India,IA,2016,IA201400010002,638,Nicobars,,Rural,9.143844,92.826752,...,0.75916,0.772052,0.810813,0.812775,0.836262,0.801029,0.750419,0.836247,0.790905,0.772825
2,India,IA,2016,IA201400010003,638,Nicobars,,Rural,9.212246,92.753633,...,0.810209,0.769421,0.792985,0.787186,0.806806,0.816891,0.788199,0.823324,0.833278,0.818003
3,India,IA,2016,IA201400010004,638,Nicobars,,Rural,9.165413,92.742696,...,0.819505,0.780735,0.805998,0.812258,0.8219,0.80856,0.795652,0.827302,0.82767,0.808355
4,India,IA,2016,IA201400010005,638,Nicobars,,Rural,8.307356,93.093792,...,0.675175,0.729355,0.628397,0.695162,0.717978,0.777969,0.773211,0.760427,0.793855,0.792485


- Column names and Number of Columns

1. country : Country Name
2. country_code : Country Code
3. end_year : ?
4. dhsid : ?
5. dhsregco : ?
6. dhsregna : ?
7. dhsyear : ?
8. URBAN_RURA : Is the Area Urban or rural
9. latnum : ?
10. longnum : ?
11. v001 : ?
12. What is the meaning of N200._. column names??

In [5]:
print(f'Total Columns in NDVI:\t{len(df_ndvi.columns.values)}')
df_ndvi.columns.values
# 20 14 15 17

Total Columns in NDVI:	251


array(['country', 'country_code', 'end_year', 'dhsid', 'dhsregco',
       'dhsregna', 'dhsyear', 'URBAN_RURA', 'latnum', 'longnum', 'v001',
       'N2000_1', 'N2000_2', 'N2000_3', 'N2000_4', 'N2000_5', 'N2000_6',
       'N2000_7', 'N2000_8', 'N2000_9', 'N2000_10', 'N2000_11',
       'N2000_12', 'N2001_1', 'N2001_2', 'N2001_3', 'N2001_4', 'N2001_5',
       'N2001_6', 'N2001_7', 'N2001_8', 'N2001_9', 'N2001_10', 'N2001_11',
       'N2001_12', 'N2002_1', 'N2002_2', 'N2002_3', 'N2002_4', 'N2002_5',
       'N2002_6', 'N2002_7', 'N2002_8', 'N2002_9', 'N2002_10', 'N2002_11',
       'N2002_12', 'N2003_1', 'N2003_2', 'N2003_3', 'N2003_4', 'N2003_5',
       'N2003_6', 'N2003_7', 'N2003_8', 'N2003_9', 'N2003_10', 'N2003_11',
       'N2003_12', 'N2004_1', 'N2004_2', 'N2004_3', 'N2004_4', 'N2004_5',
       'N2004_6', 'N2004_7', 'N2004_8', 'N2004_9', 'N2004_10', 'N2004_11',
       'N2004_12', 'N2005_1', 'N2005_2', 'N2005_3', 'N2005_4', 'N2005_5',
       'N2005_6', 'N2005_7', 'N2005_8', 'N2005_9', 'N

### NTL Table

- Top 5 entries and total rows

In [6]:
df_ntl = pd.read_csv(f'./Data Saperation/{country}/{country}_ntl.csv').drop('Unnamed: 0', axis=1)
print(f'Total Number of rows are:\t{df_ntl.shape[0]}')
df_ntl.head(5)

Total Number of rows are:	28526


Unnamed: 0,country,country_code,end_year,dhsid,dhsregco,dhsregna,dhsyear,URBAN_RURA,latnum,longnum,...,NL2004,NL2005,NL2006,NL2007,NL2008,NL2009,NL2010,NL2011,NL2012,NL2013
0,India,IA,2016,IA201400010001,640,South Andaman,2014,Urban,11.670645,92.745799,...,25.0988,18.5679,24.9753,29.9259,27.4938,27.642,30.8025,31.6543,33.2963,34.9136
1,India,IA,2016,IA201400010002,638,Nicobars,2014,Rural,9.143844,92.826752,...,1.6183,0.570978,0.561514,1.14196,0.85489,1.21767,2.07571,1.49842,1.6183,1.40063
2,India,IA,2016,IA201400010003,638,Nicobars,2014,Rural,9.212246,92.753633,...,1.73186,0.769716,0.665615,1.70662,0.864353,1.23344,2.98738,1.57098,2.44795,1.74132
3,India,IA,2016,IA201400010004,638,Nicobars,2014,Rural,9.165413,92.742696,...,1.49211,0.55205,0.577287,1.32808,0.716088,1.02524,2.76025,1.30599,1.84227,1.32177
4,India,IA,2016,IA201400010005,638,Nicobars,2014,Rural,8.307356,93.093792,...,0.0,0.0,0.0,0.072555,0.0,0.0,0.078864,0.0,0.0,0.0


- Number of columns and different names.

1. country: Country name
2. country_code: Code of country
3. end_year: ?
4. dhsid: ?
5. dhsregco: ?
6. dhsregna: ?
7. dhsyear: ?
8. URBAN_RURA: ?
9. latnum: ?
10. longnum: ?
11. v001: ?
12. Longitude: Longitude on map locations
13. Latitude: Latitude on map locations
14.     'NL1992', 'NL1993', 'NL1994', 'NL1995', 'NL1996', 'NL1997', 'NL1998', 'NL1999', 'NL2000', 'NL2001', 'NL2002', 'NL2003', 'NL2004', 'NL2005', 'NL2006', 'NL2007', 'NL2008', 'NL2009', 'NL2010', 'NL2011', 'NL2012', 'NL2013'
15.     'lights_2014', 'lights_2015', 'lights_2016', 'lights_2017',


In [7]:
print(f'Total Number of columns:\t{len(df_ntl.columns.values)}')
df_ntl.columns.values

Total Number of columns:	33


array(['country', 'country_code', 'end_year', 'dhsid', 'dhsregco',
       'dhsregna', 'dhsyear', 'URBAN_RURA', 'latnum', 'longnum', 'v001',
       'NL1992', 'NL1993', 'NL1994', 'NL1995', 'NL1996', 'NL1997',
       'NL1998', 'NL1999', 'NL2000', 'NL2001', 'NL2002', 'NL2003',
       'NL2004', 'NL2005', 'NL2006', 'NL2007', 'NL2008', 'NL2009',
       'NL2010', 'NL2011', 'NL2012', 'NL2013'], dtype=object)

In [8]:
df_ntl.tail()

Unnamed: 0,country,country_code,end_year,dhsid,dhsregco,dhsregna,dhsyear,URBAN_RURA,latnum,longnum,...,NL2004,NL2005,NL2006,NL2007,NL2008,NL2009,NL2010,NL2011,NL2012,NL2013
28521,India,IA,2016,IA201400360477,540,Warangal,2014,Urban,17.990663,79.603454,...,44.3086,39.7901,42.9753,48.3333,47.4074,48.7778,52.8642,54.037,55.037,56.321
28522,India,IA,2016,IA201400360479,540,Warangal,2014,Rural,17.571664,79.952492,...,6.42902,5.21136,6.3123,8.26498,8.39117,8.30284,12.1325,10.0379,10.9874,11.0726
28523,India,IA,2016,IA201400360480,540,Warangal,2014,Rural,17.939231,79.512509,...,17.795,14.2871,17.0095,19.5079,19.836,18.8833,26.0189,21.3312,24.2713,25.1041
28524,India,IA,2016,IA201400360481,536,Hyderabad,2014,Urban,17.420175,78.491013,...,63.0,62.8642,63.0,63.0,63.0,63.0,63.0,63.0,63.0,63.0
28525,India,IA,2016,IA201400360482,541,Khammam,2014,Rural,17.313331,79.963638,...,4.2776,3.5836,4.11672,5.20505,5.05363,5.23975,7.64038,6.57098,6.88328,7.04416


In [9]:
df_ntl.shape

(28526, 33)

### Temperature Table

- Top 5 rows and total rows. 

In [10]:
df_temperature = pd.read_csv(f'./Data Saperation/{country}/{country}_temperature.csv').drop('Unnamed: 0', axis=1)
print(f'Total rows in Temperature Table:\t{df_temperature.shape[0]}')
df_temperature.head(5)

Total rows in Temperature Table:	28526


Unnamed: 0,country,country_code,end_year,dhsid,dhsregco,dhsregna,dhsyear,URBAN_RURA,latnum,longnum,...,t2019_3,t2019_4,t2019_5,t2019_6,t2019_7,t2019_8,t2019_9,t2019_10,t2019_11,t2019_12
0,India,IA,2016,IA201400010001,640,South Andaman,2014,Urban,11.670645,92.745799,...,28.0,29.5,29.2,27.6,27.9,26.5,26.5,28.2,28.1,27.6
1,India,IA,2016,IA201400010002,638,Nicobars,2014,Rural,9.143844,92.826752,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,India,IA,2016,IA201400010003,638,Nicobars,2014,Rural,9.212246,92.753633,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,India,IA,2016,IA201400010004,638,Nicobars,2014,Rural,9.165413,92.742696,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,India,IA,2016,IA201400010005,638,Nicobars,2014,Rural,8.307356,93.093792,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Rainfall

In [11]:
df_rainfall = pd.read_csv(f'./Data Saperation/{country}/{country}_rainfall.csv').drop('Unnamed: 0', axis=1)
df_rainfall.head()

Unnamed: 0,country,country_code,end_year,dhsid,dhsregco,dhsregna,dhsyear,URBAN_RURA,latnum,longnum,...,p2019_3,p2019_4,p2019_5,p2019_6,p2019_7,p2019_8,p2019_9,p2019_10,p2019_11,p2019_12
0,India,IA,2016,IA201400010001,640,South Andaman,2014,Urban,11.670645,92.745799,...,108.6,18.8,227.9,720.5,143.7,428.0,651.4,162.6,152.9,1.7
1,India,IA,2016,IA201400010002,638,Nicobars,2014,Rural,9.143844,92.826752,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,India,IA,2016,IA201400010003,638,Nicobars,2014,Rural,9.212246,92.753633,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,India,IA,2016,IA201400010004,638,Nicobars,2014,Rural,9.165413,92.742696,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,India,IA,2016,IA201400010005,638,Nicobars,2014,Rural,8.307356,93.093792,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### EDA


In [12]:
print(f'''
    Malaria Shape:\t{df_malaria.shape}
    NDVI Shape:\t\t{df_ndvi.shape}
    NTL Shape:\t\t{df_ntl.shape}
    Temperature Shape:\t{df_temperature.shape}
''')


    Malaria Shape:	(28526, 29)
    NDVI Shape:		(28526, 251)
    NTL Shape:		(28526, 33)
    Temperature Shape:	(28526, 491)



In [13]:
print(f'''
Malaria table:\n\t{df_malaria.end_year.value_counts()}
----
NDVI table:\n\t{df_ndvi.end_year.value_counts()}
----
NTL table:\n\t{df_ntl.end_year.value_counts()}
----
Temperature table:\n\t{df_temperature.end_year.value_counts()}
----

''')


Malaria table:
	2016    28526
Name: end_year, dtype: int64
----
NDVI table:
	2016    28526
Name: end_year, dtype: int64
----
NTL table:
	2016    28526
Name: end_year, dtype: int64
----
Temperature table:
	2016    28526
Name: end_year, dtype: int64
----




In [14]:
df_malaria.drop(['country_code', 'dhsid', 'dhsregco', 'v001', 'end_year', 'dhsyear'], axis = 1, inplace = True)
df_malaria.columns

Index(['country', 'dhsregna', 'URBAN_RURA', 'latnum', 'longnum', 'pfir_2000',
       'pfir_2001', 'pfir_2002', 'pfir_2003', 'pfir_2004', 'pfir_2005',
       'pfir_2006', 'pfir_2007', 'pfir_2008', 'pfir_2009', 'pfir_2010',
       'pfir_2011', 'pfir_2012', 'pfir_2013', 'pfir_2014', 'pfir_2015',
       'pfir_2016', 'pfir_2017'],
      dtype='object')

In [15]:
df_ndvi.drop(['country', 'country_code', 'dhsid', 'end_year', 'dhsregna', 'dhsyear', 'dhsregco'], axis = 1, inplace = True)
df_ndvi = df_ndvi.dropna()
df_ndvi.columns

Index(['URBAN_RURA', 'latnum', 'longnum', 'v001', 'N2000_1', 'N2000_2',
       'N2000_3', 'N2000_4', 'N2000_5', 'N2000_6',
       ...
       'N2019_3', 'N2019_4', 'N2019_5', 'N2019_6', 'N2019_7', 'N2019_8',
       'N2019_9', 'N2019_10', 'N2019_11', 'N2019_12'],
      dtype='object', length=244)

In [16]:
df_ntl.drop(['country', 'country_code', 'dhsid', 'end_year', 'dhsregco', 'v001', 'dhsregna', 'dhsyear', 'dhsregco'], axis = 1, inplace = True)
df_ntl = df_ntl.dropna()
df_ntl.columns

Index(['URBAN_RURA', 'latnum', 'longnum', 'NL1992', 'NL1993', 'NL1994',
       'NL1995', 'NL1996', 'NL1997', 'NL1998', 'NL1999', 'NL2000', 'NL2001',
       'NL2002', 'NL2003', 'NL2004', 'NL2005', 'NL2006', 'NL2007', 'NL2008',
       'NL2009', 'NL2010', 'NL2011', 'NL2012', 'NL2013'],
      dtype='object')

In [17]:
df_temperature.drop(['country', 'country_code', 'dhsid', 'end_year', 'dhsregna', 'dhsyear', 'dhsregco'], axis = 1, inplace = True)
df_temperature = df_temperature.dropna()
df_temperature.columns

Index(['URBAN_RURA', 'latnum', 'longnum', 'v001', 't1980_1', 't1980_2',
       't1980_3', 't1980_4', 't1980_5', 't1980_6',
       ...
       't2019_3', 't2019_4', 't2019_5', 't2019_6', 't2019_7', 't2019_8',
       't2019_9', 't2019_10', 't2019_11', 't2019_12'],
      dtype='object', length=484)

In [18]:
years = ['2000', '2001',
       '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009',
        '2010', '2011', '2012', '2013', '2014', '2015', '2016',
       '2017']
df_malaria.columns = ['country', 'dhsregna','URBAN_RURA', 'latnum', 'longnum', '2000', '2001',
       '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009',
        '2010', '2011', '2012', '2013', '2014', '2015', '2016',
       '2017']
ser_int_lst = []
for index, row in df_malaria.iterrows():
    year = 2000
    
    for i in range(18):
        ser = {'year':str(year),
            'latnum':row['latnum'],
          'longnum':row['longnum'],
          'URBAN_RURA':row['URBAN_RURA'],
          'pfir': row[str(year)],
               'country':row['country'],
            'dhsregna':row['dhsregna']}
        ser_int_lst.append(ser)
        year+=1  
# print(ser_int_lst)
new_df_mal = pd.DataFrame(ser_int_lst)
new_df_mal.year = new_df_mal.year.astype(int)
new_df_mal

Unnamed: 0,year,latnum,longnum,URBAN_RURA,pfir,country,dhsregna
0,2000,11.670645,92.745799,Urban,0.101909,India,South Andaman
1,2001,11.670645,92.745799,Urban,0.082865,India,South Andaman
2,2002,11.670645,92.745799,Urban,0.077327,India,South Andaman
3,2003,11.670645,92.745799,Urban,0.077326,India,South Andaman
4,2004,11.670645,92.745799,Urban,0.087886,India,South Andaman
...,...,...,...,...,...,...,...
513463,2013,17.313331,79.963638,Rural,0.003354,India,Khammam
513464,2014,17.313331,79.963638,Rural,0.006327,India,Khammam
513465,2015,17.313331,79.963638,Rural,0.011383,India,Khammam
513466,2016,17.313331,79.963638,Rural,0.003960,India,Khammam


In [20]:
# Not in use
class NormYear():
    def __init__(self):
        self.max = 0
    def normalize(self, df:pd.DataFrame):
        self.max = df.year.max()
        df.year = df.year / self.max
        return df
    def denormalize(self, df:pd.DataFrame):
        df.year = df.year * self.max
        return df

In [21]:
df_ndvi_new = df_ndvi.copy()
# df_ndvi_new = df_ndvi.drop(['URBAN_RURA'], axis=1)
year = 2000
for col in df_ndvi_new.columns[3:]:
    if str(year) in col:
        continue
    year+=1
df_ndvi_new.columns[:16]

Index(['URBAN_RURA', 'latnum', 'longnum', 'v001', 'N2000_1', 'N2000_2',
       'N2000_3', 'N2000_4', 'N2000_5', 'N2000_6', 'N2000_7', 'N2000_8',
       'N2000_9', 'N2000_10', 'N2000_11', 'N2000_12'],
      dtype='object')

In [22]:
ser_int_lst = []
for index, row in df_ndvi_new.iterrows():
    year = 2000
    for i in range(20):
        s_year = str(year)
        ser = {'year':str(year),
            'latnum':row['latnum'],
          'longnum':row['longnum'],
            'URBAN_RURA':row['URBAN_RURA'],
          'ndvi_1': row[str('N'+s_year+'_1')],
          'ndvi_2': row[str('N'+s_year+'_2')],
          'ndvi_3': row[str('N'+s_year+'_3')],
          'ndvi_4': row[str('N'+s_year+'_4')],
          'ndvi_5': row[str('N'+s_year+'_5')],
          'ndvi_6': row[str('N'+s_year+'_6')],
          'ndvi_7': row[str('N'+s_year+'_7')],
          'ndvi_8': row[str('N'+s_year+'_8')],
          'ndvi_9': row[str('N'+s_year+'_9')],
          'ndvi_10': row[str('N'+s_year+'_10')],
          'ndvi_11': row[str('N'+s_year+'_11')],
          'ndvi_12': row[str('N'+s_year+'_12')]}
        ser_int_lst.append(ser)
        year+=1
    
print(len(ser_int_lst))
new_df_ndvi = pd.DataFrame(ser_int_lst)
new_df_ndvi.year = new_df_ndvi.year.astype(int)
new_df_ndvi.head()

567900


Unnamed: 0,year,latnum,longnum,URBAN_RURA,ndvi_1,ndvi_2,ndvi_3,ndvi_4,ndvi_5,ndvi_6,ndvi_7,ndvi_8,ndvi_9,ndvi_10,ndvi_11,ndvi_12
0,2000,11.670645,92.745799,Urban,0.616221,0.616221,0.617893,0.633412,0.647483,0.636767,0.690628,0.716299,0.724451,0.712941,0.730446,0.683256
1,2001,11.670645,92.745799,Urban,0.653911,0.633402,0.61386,0.603865,0.657769,0.642746,0.765922,0.708227,0.707478,0.725068,0.719063,0.715474
2,2002,11.670645,92.745799,Urban,0.652657,0.584416,0.554374,0.543098,0.613778,0.719132,0.650398,0.686398,0.730243,0.735867,0.722453,0.706416
3,2003,11.670645,92.745799,Urban,0.690942,0.588337,0.57929,0.629664,0.669009,0.660845,0.574167,0.65249,0.710788,0.772414,0.723005,0.69623
4,2004,11.670645,92.745799,Urban,0.658279,0.652468,0.609653,0.568996,0.55584,0.65543,0.696175,0.636196,0.681617,0.72774,0.746894,0.712562


In [23]:
tol = 1e-6
new_df_mal_i1 = new_df_mal.copy()

cols = ['ndvi_'+str(i) for i in range(13) if i>0]
col_vals = [0  for _ in range(12)]
new_df_mal_i1[cols] = col_vals
new_df_mal_i1.dropna(inplace=True)
try:
    for i, row in new_df_mal_i1.iterrows():
        mask_lat = np.isclose(new_df_ndvi.latnum, row.latnum, rtol=tol, atol=tol)
        mask_long = np.isclose(new_df_ndvi.longnum, row.longnum, rtol=tol, atol=tol)
        location_condition = mask_lat & mask_long

        # new_df_mal_i1[list(new_df_mal_i1.year == str(int(row.year))) & location_condition]
        ndvi_row = new_df_ndvi[(new_df_ndvi.year == row.year) & location_condition]
        row.iloc[7] = ndvi_row['ndvi_1']
        row.iloc[8] = ndvi_row['ndvi_2']
        row.iloc[9] = ndvi_row['ndvi_3']
        row.iloc[10] = ndvi_row['ndvi_4']
        row.iloc[11] = ndvi_row['ndvi_5']
        row.iloc[12] = ndvi_row['ndvi_6']
        row.iloc[13] = ndvi_row['ndvi_7']
        row.iloc[14] = ndvi_row['ndvi_8']
        row.iloc[15] = ndvi_row['ndvi_9']
        row.iloc[16] = ndvi_row['ndvi_10']
        row.iloc[17] = ndvi_row['ndvi_11']
        row.iloc[18] = ndvi_row['ndvi_12']
        new_df_mal_i1.iloc[i] = row
except:
    print('ndvi reshaped')
new_df_mal_i1.head()

ndvi reshaped


Unnamed: 0,year,latnum,longnum,URBAN_RURA,pfir,country,dhsregna,ndvi_1,ndvi_2,ndvi_3,ndvi_4,ndvi_5,ndvi_6,ndvi_7,ndvi_8,ndvi_9,ndvi_10,ndvi_11,ndvi_12
0,2000,11.670645,92.745799,Urban,0.101909,India,South Andaman,0.616221,0.616221,0.617893,0.633412,0.647483,0.636767,0.690628,0.716299,0.724451,0.712941,0.730446,0.683256
1,2001,11.670645,92.745799,Urban,0.082865,India,South Andaman,0.653911,0.633402,0.61386,0.603865,0.657769,0.642746,0.765922,0.708227,0.707478,0.725068,0.719063,0.715474
2,2002,11.670645,92.745799,Urban,0.077327,India,South Andaman,0.652657,0.584416,0.554374,0.543098,0.613778,0.719132,0.650398,0.686398,0.730243,0.735867,0.722453,0.706416
3,2003,11.670645,92.745799,Urban,0.077326,India,South Andaman,0.690942,0.588337,0.57929,0.629664,0.669009,0.660845,0.574167,0.65249,0.710788,0.772414,0.723005,0.69623
4,2004,11.670645,92.745799,Urban,0.087886,India,South Andaman,0.658279,0.652468,0.609653,0.568996,0.55584,0.65543,0.696175,0.636196,0.681617,0.72774,0.746894,0.712562


In [24]:
new_df_mal_i1.shape

(511110, 19)

In [25]:
new_df_mal_i1.columns

Index(['year', 'latnum', 'longnum', 'URBAN_RURA', 'pfir', 'country',
       'dhsregna', 'ndvi_1', 'ndvi_2', 'ndvi_3', 'ndvi_4', 'ndvi_5', 'ndvi_6',
       'ndvi_7', 'ndvi_8', 'ndvi_9', 'ndvi_10', 'ndvi_11', 'ndvi_12'],
      dtype='object')

In [26]:
df_t1 = new_df_mal_i1.copy()
# df_t1 = df_t1.iloc[:18,:]
df_t1.index = df_t1.year
df_t1.drop('year', axis = 1, inplace = True)
print(df_t1.shape)
df_t1.iloc[:5]

(511110, 18)


Unnamed: 0_level_0,latnum,longnum,URBAN_RURA,pfir,country,dhsregna,ndvi_1,ndvi_2,ndvi_3,ndvi_4,ndvi_5,ndvi_6,ndvi_7,ndvi_8,ndvi_9,ndvi_10,ndvi_11,ndvi_12
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2000,11.670645,92.745799,Urban,0.101909,India,South Andaman,0.616221,0.616221,0.617893,0.633412,0.647483,0.636767,0.690628,0.716299,0.724451,0.712941,0.730446,0.683256
2001,11.670645,92.745799,Urban,0.082865,India,South Andaman,0.653911,0.633402,0.61386,0.603865,0.657769,0.642746,0.765922,0.708227,0.707478,0.725068,0.719063,0.715474
2002,11.670645,92.745799,Urban,0.077327,India,South Andaman,0.652657,0.584416,0.554374,0.543098,0.613778,0.719132,0.650398,0.686398,0.730243,0.735867,0.722453,0.706416
2003,11.670645,92.745799,Urban,0.077326,India,South Andaman,0.690942,0.588337,0.57929,0.629664,0.669009,0.660845,0.574167,0.65249,0.710788,0.772414,0.723005,0.69623
2004,11.670645,92.745799,Urban,0.087886,India,South Andaman,0.658279,0.652468,0.609653,0.568996,0.55584,0.65543,0.696175,0.636196,0.681617,0.72774,0.746894,0.712562


In [27]:
df_ntl_1 = df_ntl.drop(['NL1992','NL1993','NL1994','NL1995','NL1996','NL1997','NL1998','NL1999'], axis=1)
df_ntl_1.columns

Index(['URBAN_RURA', 'latnum', 'longnum', 'NL2000', 'NL2001', 'NL2002',
       'NL2003', 'NL2004', 'NL2005', 'NL2006', 'NL2007', 'NL2008', 'NL2009',
       'NL2010', 'NL2011', 'NL2012', 'NL2013'],
      dtype='object')

In [28]:
ser_lst = []
for i, row in df_ntl_1.iterrows():
    year=2000
    latnum  = row['latnum']
    longnum = row['longnum']
    URBAN_RURA = row['URBAN_RURA']
    
    ntl_vals = row[2:]
    for ntl in ntl_vals:
        ser = {
            'latnum':latnum,
            'longnum':longnum,
            'URBAN_RURA': URBAN_RURA,
            'NL':ntl,
            'year': year
        }
        year+=1
        if year== 2020:
            year=2000
        ser_lst.append(ser)
df_ntl_2 = pd.DataFrame(ser_lst, columns = ['latnum','longnum','URBAN_RURA', 'NL','year'])
df_ntl_2.head()

Unnamed: 0,latnum,longnum,URBAN_RURA,NL,year
0,11.670645,92.745799,Urban,92.745799,2000
1,11.670645,92.745799,Urban,20.3951,2001
2,11.670645,92.745799,Urban,22.9259,2002
3,11.670645,92.745799,Urban,26.2963,2003
4,11.670645,92.745799,Urban,18.8025,2004


In [29]:
df_ntl_2.shape

(425925, 5)

In [30]:
new_df_mal_i1['ntl'] = 0
df_ntl_2.dropna(inplace=True)
try:
    for i, row in df_ntl_2.iterrows():
        mask_lat = np.isclose(new_df_mal_i1.latnum, row.latnum, rtol=tol, atol=tol)
        mask_long = np.isclose(new_df_mal_i1.longnum, row.longnum, rtol=tol, atol=tol)
        location_condition = mask_lat & mask_long

        # new_df_mal_i1[list(new_df_mal_i1.year == str(int(row.year))) & location_condition]
        ntl_row = new_df_mal_i1[(new_df_mal_i1.year == row.year) & location_condition]
        ntl_val = row.loc['NL']
        new_df_mal_i1.at[i,'ntl'] = ntl_val
    new_df_mal_i1.head()
except:
    print('ntl reshaped')

In [31]:
# Preparing temperature Table according to our needs
df_temperature_copy = df_temperature.copy()
columns = df_temperature_copy.columns

# Removing Columns that are not needed
df_temperature_copy.drop(columns[4:20*12+4], axis = 1, inplace=True)
df_temperature_copy.drop(columns[-2*12:-1], axis = 1, inplace=True)
df_temperature_copy.drop(columns[-1], axis = 1, inplace=True)
df_temperature_copy.drop(columns[[0,3]], axis = 1, inplace=True)

print(len(df_temperature_copy.columns))
temperature_lst = []
year = 2000
for i, row in df_temperature_copy.iterrows():
    for years_passed in range(18):
        base_year = str(year+years_passed)
        base_col_num = 't'+ base_year +'_'
        ser = {
            'year': base_year,
            'latnum': row[0],
            'longnum':row[1],
            't_1':row[base_col_num + '1'],
            't_2':row[base_col_num + '2'],
            't_3':row[base_col_num + '3'],
            't_4':row[base_col_num + '4'],
            't_5':row[base_col_num + '5'],
            't_6':row[base_col_num + '6'],
            't_7':row[base_col_num + '7'],
            't_8':row[base_col_num + '8'],
            't_9':row[base_col_num + '9'],
            't_10':row[base_col_num + '10'],
            't_11':row[base_col_num + '11'],
            't_12':row[base_col_num + '12']
        }
        temperature_lst.append(ser)
    
df_reshaped_temp = pd.DataFrame(temperature_lst)
df_reshaped_temp.head()

218


Unnamed: 0,year,latnum,longnum,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,t_9,t_10,t_11,t_12
0,2000,11.670645,92.745799,26.4,26.0,26.8,26.9,27.6,27.0,27.3,26.9,27.1,26.9,27.5,27.4
1,2001,11.670645,92.745799,26.5,26.3,27.3,28.8,27.2,27.5,27.4,27.1,27.2,27.2,26.9,27.1
2,2002,11.670645,92.745799,26.5,26.9,27.6,28.6,27.8,27.4,27.2,27.0,26.6,27.4,27.3,27.2
3,2003,11.670645,92.745799,26.2,27.2,27.7,28.5,28.5,27.8,26.8,27.5,27.2,27.3,28.5,27.3
4,2004,11.670645,92.745799,27.3,26.5,27.7,29.3,27.7,27.4,27.2,27.5,27.3,27.5,28.2,27.5


In [32]:
new_df_mal_i1[df_reshaped_temp.columns[3:]] = 0
new_df_mal_i1.head()

Unnamed: 0,year,latnum,longnum,URBAN_RURA,pfir,country,dhsregna,ndvi_1,ndvi_2,ndvi_3,...,t_3,t_4,t_5,t_6,t_7,t_8,t_9,t_10,t_11,t_12
0,2000.0,11.670645,92.745799,Urban,0.101909,India,South Andaman,0.616221,0.616221,0.617893,...,0,0,0,0,0,0,0,0,0,0
1,2001.0,11.670645,92.745799,Urban,0.082865,India,South Andaman,0.653911,0.633402,0.61386,...,0,0,0,0,0,0,0,0,0,0
2,2002.0,11.670645,92.745799,Urban,0.077327,India,South Andaman,0.652657,0.584416,0.554374,...,0,0,0,0,0,0,0,0,0,0
3,2003.0,11.670645,92.745799,Urban,0.077326,India,South Andaman,0.690942,0.588337,0.57929,...,0,0,0,0,0,0,0,0,0,0
4,2004.0,11.670645,92.745799,Urban,0.087886,India,South Andaman,0.658279,0.652468,0.609653,...,0,0,0,0,0,0,0,0,0,0


In [33]:
new_df_mal_i1.dropna(inplace=True)

In [34]:
print(new_df_mal_i1.shape)
new_df_mal_i1.dropna(inplace=True)
try:
    for i, row in new_df_mal_i1.iterrows():
        mask_lat = np.isclose(df_reshaped_temp.latnum, row.latnum, rtol=tol, atol=tol)
        mask_long = np.isclose(df_reshaped_temp.longnum, row.longnum, rtol=tol, atol=tol)
        location_condition = mask_lat & mask_long

        temp_row = df_reshaped_temp[(df_reshaped_temp.year==str(int(row['year']))) & location_condition]
        # print(temp_row.iloc[i,3:])
        new_df_mal_i1.iloc[i,20:] = temp_row.iloc[0, 3:]
except:
    print('Temperature reshaped')
display(new_df_mal_i1.head())

(511110, 32)
Temperature reshaped


Unnamed: 0,year,latnum,longnum,URBAN_RURA,pfir,country,dhsregna,ndvi_1,ndvi_2,ndvi_3,...,t_3,t_4,t_5,t_6,t_7,t_8,t_9,t_10,t_11,t_12
0,2000.0,11.670645,92.745799,Urban,0.101909,India,South Andaman,0.616221,0.616221,0.617893,...,26.8,26.9,27.6,27.0,27.3,26.9,27.1,26.9,27.5,27.4
1,2001.0,11.670645,92.745799,Urban,0.082865,India,South Andaman,0.653911,0.633402,0.61386,...,27.3,28.8,27.2,27.5,27.4,27.1,27.2,27.2,26.9,27.1
2,2002.0,11.670645,92.745799,Urban,0.077327,India,South Andaman,0.652657,0.584416,0.554374,...,27.6,28.6,27.8,27.4,27.2,27.0,26.6,27.4,27.3,27.2
3,2003.0,11.670645,92.745799,Urban,0.077326,India,South Andaman,0.690942,0.588337,0.57929,...,27.7,28.5,28.5,27.8,26.8,27.5,27.2,27.3,28.5,27.3
4,2004.0,11.670645,92.745799,Urban,0.087886,India,South Andaman,0.658279,0.652468,0.609653,...,27.7,29.3,27.7,27.4,27.2,27.5,27.3,27.5,28.2,27.5


In [35]:
new_df_mal_i1.shape

(511110, 32)

In [36]:
df_rnfall = df_rainfall.copy()
df_rnfall.head()

Unnamed: 0,country,country_code,end_year,dhsid,dhsregco,dhsregna,dhsyear,URBAN_RURA,latnum,longnum,...,p2019_3,p2019_4,p2019_5,p2019_6,p2019_7,p2019_8,p2019_9,p2019_10,p2019_11,p2019_12
0,India,IA,2016,IA201400010001,640,South Andaman,2014,Urban,11.670645,92.745799,...,108.6,18.8,227.9,720.5,143.7,428.0,651.4,162.6,152.9,1.7
1,India,IA,2016,IA201400010002,638,Nicobars,2014,Rural,9.143844,92.826752,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,India,IA,2016,IA201400010003,638,Nicobars,2014,Rural,9.212246,92.753633,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,India,IA,2016,IA201400010004,638,Nicobars,2014,Rural,9.165413,92.742696,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,India,IA,2016,IA201400010005,638,Nicobars,2014,Rural,8.307356,93.093792,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
df_rnfall.drop(['country', 'country_code', 'end_year', 'dhsid', 'dhsregna', 'v001', 'dhsregco','dhsyear', 'URBAN_RURA'], axis=1,inplace=True)
# Preparing Rainfall Table according to our needs
columns = df_rnfall.columns

# Removing Columns that are not needed
df_rnfall.drop(columns[2:20*12+2], axis = 1, inplace=True)
df_rnfall.drop(columns[-2*12:-1], axis = 1, inplace=True)
df_rnfall.drop(columns[-1], axis = 1, inplace=True)

print(len(df_rnfall.columns))
df_rnfall.head()

218


Unnamed: 0,latnum,longnum,p2000_1,p2000_2,p2000_3,p2000_4,p2000_5,p2000_6,p2000_7,p2000_8,...,p2017_3,p2017_4,p2017_5,p2017_6,p2017_7,p2017_8,p2017_9,p2017_10,p2017_11,p2017_12
0,11.670645,92.745799,36.1,182.2,20.0,219.7,446.5,388.2,238.7,556.9,...,9.7,155.4,215.5,563.7,562.5,530.0,632.0,241.5,177.7,167.4
1,9.143844,92.826752,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,9.212246,92.753633,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,9.165413,92.742696,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,8.307356,93.093792,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
df_rnfall.columns

Index(['latnum', 'longnum', 'p2000_1', 'p2000_2', 'p2000_3', 'p2000_4',
       'p2000_5', 'p2000_6', 'p2000_7', 'p2000_8',
       ...
       'p2017_3', 'p2017_4', 'p2017_5', 'p2017_6', 'p2017_7', 'p2017_8',
       'p2017_9', 'p2017_10', 'p2017_11', 'p2017_12'],
      dtype='object', length=218)

In [39]:
rf_lst = []
year = 2000
for i, row in df_rnfall.iterrows():
    for years_passed in range(18):
        base_year = str(year+years_passed)
        base_col_num = 'p'+ base_year +'_'
        ser = {
            'year': base_year,
            'latnum': row[0],
            'longnum':row[1],
            'rf_1':row[base_col_num + '1'],
            'rf_2':row[base_col_num + '2'],
            'rf_3':row[base_col_num + '3'],
            'rf_4':row[base_col_num + '4'],
            'rf_5':row[base_col_num + '5'],
            'rf_6':row[base_col_num + '6'],
            'rf_7':row[base_col_num + '7'],
            'rf_8':row[base_col_num + '8'],
            'rf_9':row[base_col_num + '9'],
            'rf_10':row[base_col_num + '10'],
            'rf_11':row[base_col_num + '11'],
            'rf_12':row[base_col_num + '12']
        }
        rf_lst.append(ser)
df_reshaped_rf = pd.DataFrame(rf_lst)
df_reshaped_rf.head()

Unnamed: 0,year,latnum,longnum,rf_1,rf_2,rf_3,rf_4,rf_5,rf_6,rf_7,rf_8,rf_9,rf_10,rf_11,rf_12
0,2000,11.670645,92.745799,36.1,182.2,20.0,219.7,446.5,388.2,238.7,556.9,313.3,282.6,129.2,66.0
1,2001,11.670645,92.745799,135.5,10.7,20.0,16.7,898.2,229.7,316.4,441.7,524.0,272.5,108.1,65.5
2,2002,11.670645,92.745799,0.0,0.0,27.2,89.7,415.7,359.2,465.3,286.0,511.9,89.3,229.0,101.1
3,2003,11.670645,92.745799,24.4,0.0,20.0,13.6,301.3,227.7,553.2,343.4,509.5,272.5,52.2,57.2
4,2004,11.670645,92.745799,115.5,55.4,1.9,35.5,632.2,608.2,414.6,333.7,522.8,253.7,270.2,143.0


In [40]:
new_df_mal_i1[df_reshaped_rf.columns[3:]] = 0
new_df_mal_i1.head()

Unnamed: 0,year,latnum,longnum,URBAN_RURA,pfir,country,dhsregna,ndvi_1,ndvi_2,ndvi_3,...,rf_3,rf_4,rf_5,rf_6,rf_7,rf_8,rf_9,rf_10,rf_11,rf_12
0,2000.0,11.670645,92.745799,Urban,0.101909,India,South Andaman,0.616221,0.616221,0.617893,...,0,0,0,0,0,0,0,0,0,0
1,2001.0,11.670645,92.745799,Urban,0.082865,India,South Andaman,0.653911,0.633402,0.61386,...,0,0,0,0,0,0,0,0,0,0
2,2002.0,11.670645,92.745799,Urban,0.077327,India,South Andaman,0.652657,0.584416,0.554374,...,0,0,0,0,0,0,0,0,0,0
3,2003.0,11.670645,92.745799,Urban,0.077326,India,South Andaman,0.690942,0.588337,0.57929,...,0,0,0,0,0,0,0,0,0,0
4,2004.0,11.670645,92.745799,Urban,0.087886,India,South Andaman,0.658279,0.652468,0.609653,...,0,0,0,0,0,0,0,0,0,0


In [41]:
print(new_df_mal_i1.shape)
new_df_mal_i1.dropna(inplace=True)
try:
    for i, row in new_df_mal_i1.iterrows():
        mask_lat = np.isclose(df_reshaped_rf.latnum, row.latnum, rtol=tol, atol=tol)
        mask_long = np.isclose(df_reshaped_rf.longnum, row.longnum, rtol=tol, atol=tol)
        location_condition = mask_lat & mask_long

        temp_row = df_reshaped_rf[(df_reshaped_rf.year==str(int(row['year']))) & location_condition]
        # print(temp_row.iloc[i,3:])
        new_df_mal_i1.iloc[i,32:] = temp_row.iloc[0, 3:]
    display(new_df_mal_i1.head())
except:
    print('Rainfall Reshaped')

(511110, 44)
Rainfall Reshaped


In [42]:
new_df_mal_i1.dhsregna.unique


<bound method Series.unique of 0         South Andaman
1         South Andaman
2         South Andaman
3         South Andaman
4         South Andaman
              ...      
513463       Rangareddy
513464       Rangareddy
513465       Rangareddy
513466       Rangareddy
513467       Rangareddy
Name: dhsregna, Length: 511110, dtype: object>

In [43]:
new_df_mal_i1.index = new_df_mal_i1.year
new_df_mal_i1.drop('year', axis=1, inplace = True)

In [44]:
new_df_mal_i1.dropna(inplace=True)

encoder_urban_rural = LabelEncoder()
encoder_dhsregna = LabelEncoder()
encoder_country = LabelEncoder()

new_df_mal_i1.URBAN_RURA = encoder_urban_rural.fit_transform(new_df_mal_i1.URBAN_RURA)
new_df_mal_i1.dhsregna = encoder_dhsregna.fit_transform(new_df_mal_i1.dhsregna)
new_df_mal_i1.country = encoder_dhsregna.fit_transform(new_df_mal_i1.country)


In [45]:
new_df_mal_i1.head()

Unnamed: 0_level_0,latnum,longnum,URBAN_RURA,pfir,country,dhsregna,ndvi_1,ndvi_2,ndvi_3,ndvi_4,...,rf_3,rf_4,rf_5,rf_6,rf_7,rf_8,rf_9,rf_10,rf_11,rf_12
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000.0,11.670645,92.745799,1,0.101909,0,546,0.616221,0.616221,0.617893,0.633412,...,20.0,219.7,446.5,388.2,238.7,556.9,313.3,282.6,129.2,66.0
2001.0,11.670645,92.745799,1,0.082865,0,546,0.653911,0.633402,0.61386,0.603865,...,20.0,16.7,898.2,229.7,316.4,441.7,524.0,272.5,108.1,65.5
2002.0,11.670645,92.745799,1,0.077327,0,546,0.652657,0.584416,0.554374,0.543098,...,27.2,89.7,415.7,359.2,465.3,286.0,511.9,89.3,229.0,101.1
2003.0,11.670645,92.745799,1,0.077326,0,546,0.690942,0.588337,0.57929,0.629664,...,20.0,13.6,301.3,227.7,553.2,343.4,509.5,272.5,52.2,57.2
2004.0,11.670645,92.745799,1,0.087886,0,546,0.658279,0.652468,0.609653,0.568996,...,1.9,35.5,632.2,608.2,414.6,333.7,522.8,253.7,270.2,143.0


In [46]:
new_df_mal_i1.iloc[:5,:]

Unnamed: 0_level_0,latnum,longnum,URBAN_RURA,pfir,country,dhsregna,ndvi_1,ndvi_2,ndvi_3,ndvi_4,...,rf_3,rf_4,rf_5,rf_6,rf_7,rf_8,rf_9,rf_10,rf_11,rf_12
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000.0,11.670645,92.745799,1,0.101909,0,546,0.616221,0.616221,0.617893,0.633412,...,20.0,219.7,446.5,388.2,238.7,556.9,313.3,282.6,129.2,66.0
2001.0,11.670645,92.745799,1,0.082865,0,546,0.653911,0.633402,0.61386,0.603865,...,20.0,16.7,898.2,229.7,316.4,441.7,524.0,272.5,108.1,65.5
2002.0,11.670645,92.745799,1,0.077327,0,546,0.652657,0.584416,0.554374,0.543098,...,27.2,89.7,415.7,359.2,465.3,286.0,511.9,89.3,229.0,101.1
2003.0,11.670645,92.745799,1,0.077326,0,546,0.690942,0.588337,0.57929,0.629664,...,20.0,13.6,301.3,227.7,553.2,343.4,509.5,272.5,52.2,57.2
2004.0,11.670645,92.745799,1,0.087886,0,546,0.658279,0.652468,0.609653,0.568996,...,1.9,35.5,632.2,608.2,414.6,333.7,522.8,253.7,270.2,143.0


#### Converting to Time series data

In [47]:
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    print(int(data.shape[0]/18))
    data['next_pfir'] = data['pfir']
    
    return data

In [48]:
# Returns number of batches for the total data provided to the function
def getNoOfBatches(batch_size, dataset_size):
    return int(dataset_size/batch_size)
# Returns two data frames X:(Features of all Examples reshaped into batches) and Y:(Target of all Examples turned into batches)
def getBatches(df:pd.DataFrame, batch_size=1):
    df_ref = df.values

    Y = df_ref[:, -1]
    Y = Y.reshape(Y.shape[0], 1)
    X = df_ref[:, :df_ref.shape[1]-1]
    # Here batch size indicates the number of timestamps/ No. of years
    no_batches = getNoOfBatches(batch_size = batch_size, dataset_size = X.shape[0])

    X = X.reshape(no_batches, batch_size, X.shape[1])
    Y = Y.reshape(no_batches, batch_size, Y.shape[1])
    return X, Y

In [49]:
df_reframed = new_df_mal_i1.copy()
# Converting the Series Data into Supervised Time Series
df_reframed = series_to_supervised(new_df_mal_i1)

28395


In [50]:
df_reframed.to_csv(f'./data/reshaped_data/{country}_reframed.csv', header=True, index=True)