In [45]:
import pandas as pd
import sklearn
# Load the raw daily weather observations.
# `parse_dates=True` only asks pandas to try parsing the *index*, so the `date`
# column was never converted; name the column explicitly instead.
# `infer_datetime_format` is dropped because it is deprecated (pandas 2.0) and
# the default parser already infers a consistent format.
f = pd.read_csv("daily_weather_raw.csv", sep=',', parse_dates=['date'])
f.head(2)

Unnamed: 0,date,air_pressure_DAYMAX,air_temp_DAYMAX,avg_wind_speed_DAYMAX,max_wind_speed_DAYMAX,rain_accumulation_DAYMAX,rain_duration_DAYMAX,relative_humidity_DAYMAX,air_pressure_DAYMIN,air_temp_DAYMIN,...,relative_humidity_9am,air_pressure_3pm,air_temp_3pm,avg_wind_direction_3pm,avg_wind_speed_3pm,max_wind_direction_3pm,max_wind_speed_3pm,rain_accumulation_3pm,rain_duration_3pm,relative_humidity_3pm
0,2011-09-10,914.9,68.0,8.9,9.3,0.0,0.0,90.8,911.1,52.7,...,86.27,913.05,61.394,211.9,5.83,219.0,6.92,0.0,0.0,69.9
1,2011-09-11,918.2,71.6,6.1,7.1,0.0,0.0,91.2,914.5,51.98,...,77.48,916.5,70.25,233.7,2.49,252.8,3.23,0.0,0.0,49.09


In [46]:
# Preview the four relative-humidity columns for the first row only.
f[['relative_humidity_DAYMAX','relative_humidity_9am','relative_humidity_DAYMIN','relative_humidity_3pm']].head(1)

Unnamed: 0,relative_humidity_DAYMAX,relative_humidity_9am,relative_humidity_DAYMIN,relative_humidity_3pm
0,90.8,86.27,29.0,69.9


In [47]:
def functionG(row):
    """Label one observation by its 3pm relative humidity.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the key ``'relative_humidity_3pm'``.

    Returns
    -------
    int or float
        ``1`` when the 3pm humidity is at or below 25 (low humidity),
        ``0`` when it is above 25, and ``NaN`` when the reading is missing.
    """
    humidity = row['relative_humidity_3pm']
    # Every comparison against NaN is False, so a missing reading used to fall
    # through to the final branch and be labelled 1 (low humidity).  Propagate
    # the missing value instead so the row can be discarded by dropna().
    if pd.isna(humidity):
        return float('nan')
    # The original `< 25` and `else` (== 25) branches both produced 1, so a
    # single comparison is sufficient.
    return 0 if humidity > 25 else 1

In [48]:
# Build the target column by running functionG over each row of the frame.
f['label'] = f.apply(functionG, axis='columns')

In [49]:
# (rows, columns) of the frame, now including the new `label` column.
f.shape

(1097, 34)

In [50]:
#REMOVE HUMIDITY
# The label was derived from relative_humidity_3pm, so keep a private copy of
# that reading before every humidity column is removed from the frame.
relative_humidity_3pm = f['relative_humidity_3pm'].copy(deep=True)
humidity_cols = [name for name in f.columns if 'humidity' in name.lower()]
f.drop(humidity_cols, axis=1, inplace=True)
f.shape

(1097, 30)

In [51]:
#REMOVE DAY features
# Case-insensitive substring match: drops every column whose name contains
# "day" (the *_DAYMAX / *_DAYMIN aggregates).
day_cols = [name for name in f.columns if 'day' in name.lower()]
f.drop(day_cols, axis=1, inplace=True)
f.shape

(1097, 18)

In [52]:
#REMOVE 3pm features
# Drop every remaining column whose name contains "3pm".
afternoon_cols = [name for name in f.columns if '3pm' in name.lower()]
f.drop(afternoon_cols, axis=1, inplace=True)
f.shape

(1097, 10)

In [53]:
# Columns left after the humidity, DAY* and 3pm removals.
f.columns

Index(['date', 'air_pressure_9am', 'air_temp_9am', 'avg_wind_direction_9am',
       'avg_wind_speed_9am', 'max_wind_direction_9am', 'max_wind_speed_9am',
       'rain_accumulation_9am', 'rain_duration_9am', 'label'],
      dtype='object')

In [54]:
# Keep only rows with no missing value in any column, then compare the size of
# the frame before and after.
p = f.dropna(axis=0, how='any')
f.shape, p.shape

((1097, 10), (1077, 10))

In [55]:
# Split the complete (NaN-free) rows by class.
# The masks are built from the same cleaned frame they index.  The original
# built them from the un-cleaned `f` and relied on .loc re-aligning a longer
# boolean Series, and it also ran dropna() once per class.
complete_rows = f.dropna()
label1 = complete_rows.loc[complete_rows['label'] == 1]
label0 = complete_rows.loc[complete_rows['label'] == 0]
label0.shape, label1.shape

((876, 10), (201, 10))

In [56]:
# Undersample the label-0 rows to a fixed size, then stack them on top of every
# label-1 row.
N_LABEL0_ROWS = 547  # NOTE(review): the notebook does not explain this size
label0 = label0.sample(N_LABEL0_ROWS, random_state=42)

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# performs the same row-wise stacking.  reset_index(drop=True) renumbers the
# rows 0..n-1 without first materialising the old index as an 'index' column.
f = pd.concat([label0, label1]).reset_index(drop=True)

f.shape

(748, 10)

In [60]:
from imblearn.over_sampling import SMOTE

# Oversample with SMOTE.  The former `ratio='auto'` and `kind='regular'`
# arguments only restated the defaults and were removed from imbalanced-learn
# (0.6), so they are no longer passed.
smote = SMOTE(random_state=12)

# Select columns by name rather than by position so the cell does not depend
# on `date` being first and `label` being last.
target = 'label'
features = f.columns.drop(['date', target])

x = f[features].copy(deep=True)
y = f[target].copy(deep=True)

# `fit_sample` was renamed `fit_resample` in imbalanced-learn 0.4 and the old
# name was later removed; fall back to it only on releases that predate the
# rename.
resample = getattr(smote, 'fit_resample', None) or smote.fit_sample
X, Y = resample(x, y)

In [66]:
# Wrap the resampled features and target in DataFrames.
X, Y = pd.DataFrame(X), pd.DataFrame(Y)

In [67]:
# Restore readable column names on the resampled frames.
X.columns, Y.columns = features, ['label']

In [68]:
# Confirm the column names now carried by the feature and target frames.
X.columns, Y.columns

(Index(['air_pressure_9am', 'air_temp_9am', 'avg_wind_direction_9am',
        'avg_wind_speed_9am', 'max_wind_direction_9am', 'max_wind_speed_9am',
        'rain_accumulation_9am', 'rain_duration_9am'],
       dtype='object'), Index(['label'], dtype='object'))

In [69]:
# Shape of the target frame and the sum of its `label` column (with 0/1
# labels, the sum is the number of label-1 rows).
Y.shape, Y.sum()

((1094, 1), label    547
 dtype: int64)

In [70]:
########################
# RANDOM SPLIT
########################

# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; train_test_split is provided by sklearn.model_selection with the same
# call signature.
# NOTE(review): X and Y are the SMOTE-resampled data at this point, so the
# held-out 40% may contain synthetic rows and rows interpolated from training
# neighbours -- confirm this is acceptable for the reported metrics.
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.40, random_state=30)

In [71]:
# Sum of `label` overall, in the test split and in the training split.
Y.sum(), Y_test.sum(), Y_train.sum()

(label    547
 dtype: int64, label    223
 dtype: int64, label    324
 dtype: int64)

## Decision Tree Classifier

In [72]:
from sklearn.tree import DecisionTreeClassifier

# Fix the seed so the fitted tree, and therefore the metrics reported below,
# are the same on every run.  The labels are passed as a 1-D array, the shape
# scikit-learn expects for a single target.
clf = DecisionTreeClassifier(random_state=42).fit(X_train, Y_train.values.ravel())
Z = clf.predict(X_test)

In [73]:
# Sum of the predictions, sum of the true test labels, and test rows minus
# that sum (with 0/1 labels: predicted 1s, actual 1s, actual 0s).
Z.sum(), Y_test.sum(), Y_test.shape[0]-Y_test.sum()

(209, label    223
 dtype: int64, label    215
 dtype: int64)

In [74]:
from sklearn.metrics import classification_report

# 1 means LOW humidity
tree_report = classification_report(Y_test, Z)
print(tree_report)

             precision    recall  f1-score   support

          0       0.79      0.84      0.81       215
          1       0.83      0.78      0.81       223

avg / total       0.81      0.81      0.81       438



In [75]:
# Import the function explicitly: `sklearn.metrics` is only reachable as an
# attribute of `sklearn` when some other cell has already imported that
# submodule.  `normalize=True` is the default (fraction of correct
# predictions), so it is omitted.
from sklearn.metrics import accuracy_score
accuracy_score(Y_test, Z)

0.80821917808219179

## Naive Bayes

In [76]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
# Y_train is a one-column DataFrame; flatten it to 1-D so scikit-learn does not
# have to convert a column vector (the source of the `column_or_1d` warning).
gnb_classifier = gnb.fit(X_train, Y_train.values.ravel())
Z_NaiveBayes = gnb_classifier.predict(X_test)

  y = column_or_1d(y, warn=True)


In [77]:
# 1 means LOW humidity
nb_report = classification_report(Y_test, Z_NaiveBayes)
print(nb_report)

             precision    recall  f1-score   support

          0       0.95      0.16      0.28       215
          1       0.55      0.99      0.71       223

avg / total       0.74      0.58      0.50       438



In [78]:
# Import the function explicitly: `sklearn.metrics` is only reachable as an
# attribute of `sklearn` when some other cell has already imported that
# submodule.  `normalize=True` is the default (fraction of correct
# predictions), so it is omitted.
from sklearn.metrics import accuracy_score
accuracy_score(Y_test, Z_NaiveBayes)

0.58447488584474883

## Writing data to csv

In [79]:
# Build the output frame: the resampled 9am features plus the 3pm humidity.
# NOTE(review): `relative_humidity_3pm` was copied from the raw frame before
# any rows were dropped, undersampled, re-indexed or synthesised, while X is
# the resampled feature frame (presumably on a fresh 0..n-1 index).  Column
# assignment aligns on index labels, so row i of X appears to receive the
# humidity of raw row i rather than that of the observation it was built from,
# and SMOTE-generated rows have no measured humidity at all.  This looks
# unintended -- confirm, and carry the humidity through the sampling steps.
df = X.copy(deep=True)
df['relative_humidity_3pm'] = relative_humidity_3pm

In [81]:
# shuffle
import numpy as np

# Draw the permutation from a seeded generator so the row order of the written
# file is identical on every run (the global NumPy RNG was never seeded).
shuffle_rng = np.random.RandomState(42)
df = df.iloc[shuffle_rng.permutation(len(df))].reset_index(drop=True)

In [82]:
import random

# Number of feature cells to blank out, and a seeded generator so the same
# cells are chosen on every run (the module-level RNG was never seeded).
N_MISSING_CELLS = 31
missing_rng = random.Random(42)

# Every (row, column) position except the last column, which is left intact.
n_rows, n_cols = df.shape
ix = [(row, col) for row in range(n_rows) for col in range(n_cols - 1)]

# Cap the request at the population size so a very small frame does not make
# sample() raise ValueError.
for row, col in missing_rng.sample(ix, min(N_MISSING_CELLS, len(ix))):
    df.iat[row, col] = np.nan

In [83]:
# Persist the prepared frame; the row index is written as the first column.
df.to_csv("daily_weather.csv", index=True)