In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# IMPORT  WEATHER DATA

In [2]:
data = pd.read_csv('./testset.csv')
data.head()

Unnamed: 0,datetime_utc,_conds,_dewptm,_fog,_hail,_heatindexm,_hum,_precipm,_pressurem,_rain,_snow,_tempm,_thunder,_tornado,_vism,_wdird,_wdire,_wgustm,_windchillm,_wspdm
0,19961101-11:00,Smoke,9.0,0,0,,27.0,,1010.0,0,0,30.0,0,0,5.0,280.0,West,,,7.4
1,19961101-12:00,Smoke,10.0,0,0,,32.0,,-9999.0,0,0,28.0,0,0,,0.0,North,,,
2,19961101-13:00,Smoke,11.0,0,0,,44.0,,-9999.0,0,0,24.0,0,0,,0.0,North,,,
3,19961101-14:00,Smoke,10.0,0,0,,41.0,,1010.0,0,0,24.0,0,0,2.0,0.0,North,,,
4,19961101-16:00,Smoke,11.0,0,0,,47.0,,1011.0,0,0,23.0,0,0,1.2,0.0,North,,,0.0


In [3]:
df_info= pd.DataFrame({"Dtype": data.dtypes, "Unique": data.nunique(), "Missing%": (data.isnull().sum()/data.shape[0])*100})
df_info

Unnamed: 0,Dtype,Unique,Missing%
datetime_utc,object,100990,0.0
_conds,object,39,0.071294
_dewptm,float64,51,0.614912
_fog,int64,2,0.0
_hail,int64,2,0.0
_heatindexm,float64,193,71.130805
_hum,float64,100,0.749579
_precipm,float64,0,100.0
_pressurem,float64,139,0.229726
_rain,int64,2,0.0


# DATA PROCESSING

In [4]:
weather_data = data[[' _fog',' _hail',' _hum',' _pressurem',' _thunder',' _rain']]
new_weather_data = weather_data.dropna()
print('length before drop:{}\n lenght after drop:{}'.format(len(weather_data),len(new_weather_data)))

length before drop:100990
 lenght after drop:100003


# SELECT TARGET Y

In [5]:
y = new_weather_data[[' _rain']]
y.head()

Unnamed: 0,_rain
0,0
1,0
2,0
3,0
4,0


# SELECT FEATURES X

In [6]:
x = new_weather_data[[' _fog',' _hail',' _hum',' _pressurem',' _thunder']]
x.head()

Unnamed: 0,_fog,_hail,_hum,_pressurem,_thunder
0,0,0,27.0,1010.0,0
1,0,0,32.0,-9999.0,0
2,0,0,44.0,-9999.0,0
3,0,0,41.0,1010.0,0
4,0,0,47.0,1011.0,0


In [7]:
#split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.33,random_state=324)
print('train_data lenght:{}\ntest_data lenght:{}'.format(len(x_train),len(y_test)))

train_data lenght:67002
test_data lenght:33001


# TRAIN THE MODEL

In [8]:
rain_classifier = DecisionTreeClassifier(max_leaf_nodes=10,random_state=0)
rain_classifier.fit(x_train,y_train)

DecisionTreeClassifier(max_leaf_nodes=10, random_state=0)

# PREDICTION

In [9]:
predictions = rain_classifier.predict(x_test)
print('The predicted values:{}\n Y_test = {}'.format(predictions[:15],y_test[:15]))

The predicted values:[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 Y_test =         _rain
15009       0
21650       0
76726       0
14008       0
23210       0
93936       0
17320       0
68947       0
65466       0
33222       0
26422       0
29386       0
72321       0
50983       0
24117       0


# TEST THE ACCURACY OF THE CLASSIFIER

In [10]:
accuracy_score(y_true=y_test,y_pred=predictions)

0.9791521469046393

# SELF DATA TEST

In [11]:
#create 2d test array
self_test = [[0,0,27.0,100.0,0]]
#predict it with our pretrained model
rain_classifier.predict(self_test)

array([0], dtype=int64)