# Prediction of Lightning in India using ML

Importing the necessary libraries

In [1]:
import numpy as np
import pandas as pd
from pandas_profiling import ProfileReport

In [2]:
# train.csv holds the lightning data for different districts of UP (Uttar Pradesh).
df = pd.read_csv('train.csv')

Wind and other weather data accessed from https://power.larc.nasa.gov/data-access-viewer/ and lightning labelled using https://lightning.nsstc.nasa.gov/nlisib/nlissearch.html

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,YEAR,T2M,T2MDEW,T2MWET,QV2M,RH2M,PRECTOTCORR,PS,WS10M,WD10M,WS50M,WD50M,Location,timestamp,lightning
0,0,2001,13.83,10.39,12.12,7.87,79.56,0.13,99.87,2.82,96.83,5.23,96.43,Banda,2001-01-01 00:00:00,0
1,1,2001,13.73,10.64,12.19,8.0,81.44,0.12,99.83,2.64,107.9,4.69,107.05,Banda,2001-01-01 01:00:00,0
2,2,2001,13.55,10.83,12.19,8.12,83.5,0.22,99.82,2.72,119.81,4.66,118.97,Banda,2001-01-01 02:00:00,0
3,3,2001,13.37,10.97,12.17,8.18,85.19,0.1,99.8,3.02,129.96,4.91,129.44,Banda,2001-01-01 03:00:00,0
4,4,2001,13.27,11.1,12.19,8.24,86.5,0.04,99.82,3.09,139.82,4.91,139.45,Banda,2001-01-01 04:00:00,0


- T2M             MERRA-2 Temperature at 2 Meters (C) 
- T2MDEW          MERRA-2 Dew/Frost Point at 2 Meters (C) 
- T2MWET          MERRA-2 Wet Bulb Temperature at 2 Meters (C) 
- QV2M            MERRA-2 Specific Humidity at 2 Meters (g/kg) 
- RH2M            MERRA-2 Relative Humidity at 2 Meters (%) 
- PRECTOTCORR     MERRA-2 Precipitation Corrected (mm/hour) 
- PS              MERRA-2 Surface Pressure (kPa) 
- WS10M           MERRA-2 Wind Speed at 10 Meters (m/s) 
- WD10M           MERRA-2 Wind Direction at 10 Meters (Degrees) 
- WS50M           MERRA-2 Wind Speed at 50 Meters (m/s) 
- WD50M           MERRA-2 Wind Direction at 50 Meters (Degrees) 

In [4]:
# Remove the YEAR column from df in place (the timestamp column still carries the date).
df.drop(columns='YEAR', inplace=True)

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,T2M,T2MDEW,T2MWET,QV2M,RH2M,PRECTOTCORR,PS,WS10M,WD10M,WS50M,WD50M,Location,timestamp,lightning
0,0,13.83,10.39,12.12,7.87,79.56,0.13,99.87,2.82,96.83,5.23,96.43,Banda,2001-01-01 00:00:00,0
1,1,13.73,10.64,12.19,8.0,81.44,0.12,99.83,2.64,107.9,4.69,107.05,Banda,2001-01-01 01:00:00,0
2,2,13.55,10.83,12.19,8.12,83.5,0.22,99.82,2.72,119.81,4.66,118.97,Banda,2001-01-01 02:00:00,0
3,3,13.37,10.97,12.17,8.18,85.19,0.1,99.8,3.02,129.96,4.91,129.44,Banda,2001-01-01 03:00:00,0
4,4,13.27,11.1,12.19,8.24,86.5,0.04,99.82,3.09,139.82,4.91,139.45,Banda,2001-01-01 04:00:00,0


In [6]:
# Drop the leftover saved-index column ("Unnamed: ...") by name rather than by
# position. Selecting by name keeps this cell idempotent: a positional delete
# of the first column would remove the next feature column (T2M) on a re-run,
# whereas this is a no-op once the column is gone.
unnamed_cols = [col for col in df.columns if str(col).startswith('Unnamed')]
df = df.drop(columns=unnamed_cols)

In [7]:
df.head()

Unnamed: 0,T2M,T2MDEW,T2MWET,QV2M,RH2M,PRECTOTCORR,PS,WS10M,WD10M,WS50M,WD50M,Location,timestamp,lightning
0,13.83,10.39,12.12,7.87,79.56,0.13,99.87,2.82,96.83,5.23,96.43,Banda,2001-01-01 00:00:00,0
1,13.73,10.64,12.19,8.0,81.44,0.12,99.83,2.64,107.9,4.69,107.05,Banda,2001-01-01 01:00:00,0
2,13.55,10.83,12.19,8.12,83.5,0.22,99.82,2.72,119.81,4.66,118.97,Banda,2001-01-01 02:00:00,0
3,13.37,10.97,12.17,8.18,85.19,0.1,99.8,3.02,129.96,4.91,129.44,Banda,2001-01-01 03:00:00,0
4,13.27,11.1,12.19,8.24,86.5,0.04,99.82,3.09,139.82,4.91,139.45,Banda,2001-01-01 04:00:00,0


In [8]:
# Feature matrix: every column except the target. drop() returns an independent
# frame, so later column assignments and in-place drops on x do not operate on
# a slice of df (which can raise SettingWithCopyWarning).
x = df.drop(columns=['lightning'])

In [9]:
x.head()

Unnamed: 0,T2M,T2MDEW,T2MWET,QV2M,RH2M,PRECTOTCORR,PS,WS10M,WD10M,WS50M,WD50M,Location,timestamp
0,13.83,10.39,12.12,7.87,79.56,0.13,99.87,2.82,96.83,5.23,96.43,Banda,2001-01-01 00:00:00
1,13.73,10.64,12.19,8.0,81.44,0.12,99.83,2.64,107.9,4.69,107.05,Banda,2001-01-01 01:00:00
2,13.55,10.83,12.19,8.12,83.5,0.22,99.82,2.72,119.81,4.66,118.97,Banda,2001-01-01 02:00:00
3,13.37,10.97,12.17,8.18,85.19,0.1,99.8,3.02,129.96,4.91,129.44,Banda,2001-01-01 03:00:00
4,13.27,11.1,12.19,8.24,86.5,0.04,99.82,3.09,139.82,4.91,139.45,Banda,2001-01-01 04:00:00


In [10]:
# Target vector, selected by column name instead of a hard-coded position so it
# stays correct if columns are added, removed or reordered upstream.
y = df['lightning']

In [11]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: lightning, dtype: int64

Encode each Location (district name) as an integer code: 0, 1, 2, …

In [12]:
from sklearn.preprocessing import LabelEncoder

# Replace each district name in the Location column with an integer code.
# The fitted encoder stays bound to label_encoder_x.
label_encoder_x = LabelEncoder()
label_encoder_x.fit(x['Location'])
x['Location'] = label_encoder_x.transform(x['Location'])

In [13]:
x.head()

Unnamed: 0,T2M,T2MDEW,T2MWET,QV2M,RH2M,PRECTOTCORR,PS,WS10M,WD10M,WS50M,WD50M,Location,timestamp
0,13.83,10.39,12.12,7.87,79.56,0.13,99.87,2.82,96.83,5.23,96.43,0,2001-01-01 00:00:00
1,13.73,10.64,12.19,8.0,81.44,0.12,99.83,2.64,107.9,4.69,107.05,0,2001-01-01 01:00:00
2,13.55,10.83,12.19,8.12,83.5,0.22,99.82,2.72,119.81,4.66,118.97,0,2001-01-01 02:00:00
3,13.37,10.97,12.17,8.18,85.19,0.1,99.8,3.02,129.96,4.91,129.44,0,2001-01-01 03:00:00
4,13.27,11.1,12.19,8.24,86.5,0.04,99.82,3.09,139.82,4.91,139.45,0,2001-01-01 04:00:00


Dropped the timestamp column

In [14]:
# Remove the timestamp column from the feature frame in place.
x.drop(columns='timestamp', inplace=True)

Train Test Split

In [15]:
 from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test= train_test_split(x, y, test_size= 0.3, random_state=0)

In [16]:
from sklearn.preprocessing import StandardScaler

Standardization using StandardScaler()

In [17]:
# Learn the scaling parameters from the training split only, then apply them to it.
st_x = StandardScaler()
st_x.fit(x_train)
x_train = st_x.transform(x_train)

In [18]:
# Scale the test split with the scaler already fitted on the training data (no refit).
x_test = st_x.transform(x_test)

In [20]:
x_train

array([[-0.2587779 ,  0.10926209, -0.07489367, ..., -1.20168192,
        -1.69952236,  0.72505098],
       [ 0.5333751 , -0.38774811,  0.0459541 , ..., -0.58250696,
        -1.88034083,  0.14584471],
       [ 0.48207603,  1.23725026,  1.10886516, ..., -0.23702527,
         0.60437361, -0.14375842],
       ...,
       [ 1.87192278,  0.63592927,  1.50436695, ..., -0.59148051,
         0.62172771, -1.59177408],
       [-0.61309934,  0.7504666 ,  0.1516959 , ...,  0.85774786,
         0.27399271,  1.01465411],
       [-0.50811521, -0.03493223, -0.3165892 , ...,  0.70968428,
         1.323169  , -1.01256781]])

In [21]:
# Wrap the scaled training array in a DataFrame; columns are numbered 0, 1, 2, ... at this point.
df_train = pd.DataFrame(data=x_train)

In [22]:
df_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,-0.258778,0.109262,-0.074894,-0.180738,0.031542,-0.260513,1.267474,-1.10533,-1.68911,-1.201682,-1.699522,0.725051
1,0.533375,-0.387748,0.045954,-0.599523,-0.929377,-0.260513,0.631433,-0.152089,-1.856461,-0.582507,-1.880341,0.145845
2,0.482076,1.23725,1.108865,1.478777,0.738144,2.483736,-2.897818,0.165658,0.597059,-0.237025,0.604374,-0.143758
3,0.665798,1.300655,1.257178,1.517843,0.612266,0.820555,-0.950912,-1.167633,-0.794215,-1.349746,-0.82594,0.435448
4,0.546498,1.290428,1.180275,1.469402,0.742535,1.735304,-0.229549,-0.687897,0.283009,-1.058105,0.29116,0.145845


In [23]:
# Restore the feature names that were lost when scaling turned the frame into
# a plain array. Assigning to .columns works on every pandas version, whereas
# set_axis(..., inplace=True) returns None (leaving df2 bound to None) and the
# inplace argument no longer exists in pandas 2.x.
feature_names = [
    'T2M', 'T2MDEW', 'T2MWET', 'QV2M', 'RH2M', 'PRECTOTCORR',
    'PS', 'WS10M', 'WD10M', 'WS50M', 'WD50M', 'Location',
]
df_train.columns = feature_names
df2 = df_train  # alias kept so the name df2 refers to the labelled frame

In [24]:
df_train.head()

Unnamed: 0,T2M,T2MDEW,T2MWET,QV2M,RH2M,PRECTOTCORR,PS,WS10M,WD10M,WS50M,WD50M,Location
0,-0.258778,0.109262,-0.074894,-0.180738,0.031542,-0.260513,1.267474,-1.10533,-1.68911,-1.201682,-1.699522,0.725051
1,0.533375,-0.387748,0.045954,-0.599523,-0.929377,-0.260513,0.631433,-0.152089,-1.856461,-0.582507,-1.880341,0.145845
2,0.482076,1.23725,1.108865,1.478777,0.738144,2.483736,-2.897818,0.165658,0.597059,-0.237025,0.604374,-0.143758
3,0.665798,1.300655,1.257178,1.517843,0.612266,0.820555,-0.950912,-1.167633,-0.794215,-1.349746,-0.82594,0.435448
4,0.546498,1.290428,1.180275,1.469402,0.742535,1.735304,-0.229549,-0.687897,0.283009,-1.058105,0.29116,0.145845


## EDA Report

In [25]:
# Build the profiling (EDA) report for the scaled training features.
profile = ProfileReport(
    df_train,
    title='Pandas Profiling Report',
    explorative=True,
)

In [26]:
# Display the report inside the notebook as interactive widgets.
profile.to_widgets()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

  (2 * xtie * ytie) / m + x0 * y0 / (9 * m * (size - 2)))


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

In [27]:
# Also export the report to the file output.html.
profile.to_file("output.html")

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]