In [1]:
import pandas as pd
import sklearn
f = pd.read_csv("daily_weather_raw.csv", sep=',', parse_dates=True, infer_datetime_format=True)
f.head(1)

Unnamed: 0,date,air_pressure_DAYMAX,air_temp_DAYMAX,avg_wind_speed_DAYMAX,max_wind_speed_DAYMAX,relative_humidity_DAYMAX,air_pressure_DAYMIN,air_temp_DAYMIN,avg_wind_speed_DAYMIN,max_wind_speed_DAYMIN,...,rain_accumulation_9am,rain_duration_9am,relative_humidity_9am,air_pressure_3pm,air_temp_3pm,avg_wind_direction_3pm,avg_wind_speed_3pm,max_wind_direction_3pm,max_wind_speed_3pm,relative_humidity_3pm
0,2011-09-11,918.2,71.6,6.1,7.1,91.2,914.5,51.98,0.1,0.1,...,0.0,0.0,77.48,916.5,70.25,233.7,2.49,252.8,3.23,49.09


In [2]:
f[['relative_humidity_DAYMAX','relative_humidity_9am','relative_humidity_DAYMIN','relative_humidity_3pm']].head(1)

Unnamed: 0,relative_humidity_DAYMAX,relative_humidity_9am,relative_humidity_DAYMIN,relative_humidity_3pm
0,91.2,77.48,40.3,49.09


In [3]:
def functionG(row):
    if row['relative_humidity_3pm'] < 25:
        val = 1
    elif row['relative_humidity_3pm'] > 25:
        val = 0
    else:
        val = 0
    return val

In [4]:
f['label']=f.apply(functionG, axis=1)

In [5]:
f.shape

(1064, 30)

In [6]:
#REMOVE HUMIDITY
cols = ['relative_humidity_DAYMAX','relative_humidity_DAYMIN']

In [7]:
for c in cols:
    del f[c]
f.shape

(1064, 28)

In [8]:
#REMOVE DAY features
cols = [c for c in f.columns if ('day' in c.lower())]
for c in cols:
    del f[c]
f.shape

(1064, 18)

In [9]:
#REMOVE 3pm features
cols = [c for c in f.columns if ('3pm' in c.lower())]
cols.remove('relative_humidity_3pm')

In [10]:
for c in cols:
    del f[c]
f.shape

(1064, 12)

In [11]:
f.columns

Index(['date', 'air_pressure_9am', 'air_temp_9am', 'avg_wind_direction_9am',
       'avg_wind_speed_9am', 'max_wind_direction_9am', 'max_wind_speed_9am',
       'rain_accumulation_9am', 'rain_duration_9am', 'relative_humidity_9am',
       'relative_humidity_3pm', 'label'],
      dtype='object')

In [12]:
# CONVERT wind from meter per sec to miles per hour

f['avg_wind_speed_9am'] = f['avg_wind_speed_9am'].apply(lambda x: x*2.23694)
f['max_wind_speed_9am'] = f['max_wind_speed_9am'].apply(lambda x: x*2.23694)

In [13]:
p = f.dropna()
f.shape, p.shape

((1064, 12), (1064, 12))

In [14]:
label1 = f.dropna().loc[f['label'] == 1]
label0 = f.dropna().loc[f['label'] == 0]
label0.shape, label1.shape

((869, 12), (195, 12))

In [15]:
label0 =label0.sample(547, random_state=42)

f=label0.append(label1)
f.reset_index(inplace=True)
del f['index']

f.shape

(742, 12)

In [16]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(ratio = 'auto', kind = 'regular', random_state=12)

lastcolumn = f.shape[1]

features = f.columns[1:lastcolumn-1]
target = f.columns[lastcolumn-1]

x=f[features].copy(deep=True)
y=f[target].copy(deep=True)

X, Y = smote.fit_sample(x, y)

In [17]:
X = pd.DataFrame(X)
Y = pd.DataFrame(Y)
X.columns, Y.columns

(RangeIndex(start=0, stop=10, step=1), RangeIndex(start=0, stop=1, step=1))

In [18]:
X.columns = features
Y.columns = ['label'] 

In [19]:
X.columns, Y.columns

(Index(['air_pressure_9am', 'air_temp_9am', 'avg_wind_direction_9am',
        'avg_wind_speed_9am', 'max_wind_direction_9am', 'max_wind_speed_9am',
        'rain_accumulation_9am', 'rain_duration_9am', 'relative_humidity_9am',
        'relative_humidity_3pm'],
       dtype='object'), Index(['label'], dtype='object'))

In [20]:
# Store a copy to write to CSV 
write_to_csv = X.copy(deep=True)
write_to_csv.columns

Index(['air_pressure_9am', 'air_temp_9am', 'avg_wind_direction_9am',
       'avg_wind_speed_9am', 'max_wind_direction_9am', 'max_wind_speed_9am',
       'rain_accumulation_9am', 'rain_duration_9am', 'relative_humidity_9am',
       'relative_humidity_3pm'],
      dtype='object')

# Classification

In [21]:
del X['relative_humidity_3pm']
del X['relative_humidity_9am']
X.columns

Index(['air_pressure_9am', 'air_temp_9am', 'avg_wind_direction_9am',
       'avg_wind_speed_9am', 'max_wind_direction_9am', 'max_wind_speed_9am',
       'rain_accumulation_9am', 'rain_duration_9am'],
      dtype='object')

In [22]:
Y.shape, Y.sum()

((1094, 1), label    547
 dtype: int64)

In [23]:
########################
# RANDOM SPLIT
########################

from sklearn.cross_validation import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split( X, Y, test_size=0.30, random_state=30)

In [24]:
Y.sum(), Y_test.sum(), Y_train.sum()

(label    547
 dtype: int64, label    172
 dtype: int64, label    375
 dtype: int64)

## Decision Tree Classifier

In [25]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier().fit(X_train, Y_train)
Z = clf.predict(X_test)

In [26]:
Z.sum(), Y_test.sum(), Y_test.shape[0]-Y_test.sum()

(162, label    172
 dtype: int64, label    157
 dtype: int64)

In [27]:
from sklearn.metrics import classification_report
# 1 means LOW humidity
print(classification_report(Y_test, Z))

             precision    recall  f1-score   support

          0       0.77      0.82      0.80       157
          1       0.83      0.78      0.80       172

avg / total       0.80      0.80      0.80       329



In [28]:
sklearn.metrics.accuracy_score(Y_test, Z, normalize = True)

0.79939209726443772

## Naive Bayes

In [29]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb_classifier = gnb.fit(X_train, Y_train)
Z_NaiveBayes = gnb_classifier.predict(X_test)

  y = column_or_1d(y, warn=True)


In [30]:
# 1 means LOW humidity
print(classification_report(Y_test, Z_NaiveBayes))

             precision    recall  f1-score   support

          0       0.95      0.22      0.36       157
          1       0.58      0.99      0.73       172

avg / total       0.76      0.62      0.56       329



In [31]:
sklearn.metrics.accuracy_score(Y_test, Z_NaiveBayes, normalize = True)

0.62310030395136773

# Writing data to csv

In [32]:
df = write_to_csv.copy(deep=True)

#add extra row
df.loc[len(df)]= df.iloc[555]

df.shape

(1095, 10)

In [33]:
# shuffle
import numpy as np
df = df.iloc[np.random.permutation(len(df))]
df = df.reset_index(drop=True)

In [34]:
import random

ix = [(row, col) for row in range(df.shape[0]) for col in range(df.shape[1]-1)]

for row, col in random.sample(ix, int(31)):
    df.iat[row, col] = np.nan

In [35]:
df.to_csv("daily_weather.csv")