In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [2]:
data = pd.read_csv("daily_weather.csv")

In [3]:
data.head()

Unnamed: 0,number,air_pressure_9am,air_temp_9am,avg_wind_direction_9am,avg_wind_speed_9am,max_wind_direction_9am,max_wind_speed_9am,rain_accumulation_9am,rain_duration_9am,relative_humidity_9am,relative_humidity_3pm
0,0,918.06,74.822,271.1,2.080354,295.4,2.863283,0.0,0.0,42.42,36.16
1,1,917.347688,71.403843,101.935179,2.443009,140.471548,3.533324,0.0,0.0,24.328697,19.426597
2,2,923.04,60.638,51.0,17.067852,63.7,22.100967,0.0,20.0,8.9,14.46
3,3,920.502751,70.138895,198.832133,4.337363,211.203341,5.190045,0.0,0.0,12.189102,12.742547
4,4,921.16,44.294,277.8,1.85666,136.5,2.863283,8.9,14730.0,92.41,76.74


In [4]:
data.columns

Index(['number', 'air_pressure_9am', 'air_temp_9am', 'avg_wind_direction_9am',
       'avg_wind_speed_9am', 'max_wind_direction_9am', 'max_wind_speed_9am',
       'rain_accumulation_9am', 'rain_duration_9am', 'relative_humidity_9am',
       'relative_humidity_3pm'],
      dtype='object')

In [5]:
del data['number']

In [6]:
data.columns

Index(['air_pressure_9am', 'air_temp_9am', 'avg_wind_direction_9am',
       'avg_wind_speed_9am', 'max_wind_direction_9am', 'max_wind_speed_9am',
       'rain_accumulation_9am', 'rain_duration_9am', 'relative_humidity_9am',
       'relative_humidity_3pm'],
      dtype='object')

In [7]:
before_rows=data.shape[0]
before_rows

1095

In [8]:
data = data.dropna()

In [9]:
after_rows = data.shape[0]

In [10]:
after_rows

1064

In [11]:
before_rows - after_rows

31

In [12]:
clean_data = data.copy()

In [13]:
clean_data['high_humidity_label'] = (clean_data['relative_humidity_3pm']>24.99)*1

In [14]:
clean_data['high_humidity_label'].head()

0    1
1    0
2    0
3    0
4    1
Name: high_humidity_label, dtype: int64

In [15]:
clean_data['relative_humidity_3pm'].head()

0    36.160000
1    19.426597
2    14.460000
3    12.742547
4    76.740000
Name: relative_humidity_3pm, dtype: float64

In [16]:
y = clean_data[['relative_humidity_3pm']].copy()
y.head()

Unnamed: 0,relative_humidity_3pm
0,36.16
1,19.426597
2,14.46
3,12.742547
4,76.74


In [17]:
clean_data['relative_humidity_3pm'].head()

0    36.160000
1    19.426597
2    14.460000
3    12.742547
4    76.740000
Name: relative_humidity_3pm, dtype: float64

In [18]:
morning_features = ['air_pressure_9am', 'air_temp_9am', 'avg_wind_direction_9am', 'avg_wind_speed_9am', 'max_wind_direction_9am', 'max_wind_speed_9am', 'rain_accumulation_9am', 'rain_duration_9am', 'relative_humidity_9am']

In [19]:
x = clean_data[morning_features]

In [20]:
x.head()

Unnamed: 0,air_pressure_9am,air_temp_9am,avg_wind_direction_9am,avg_wind_speed_9am,max_wind_direction_9am,max_wind_speed_9am,rain_accumulation_9am,rain_duration_9am,relative_humidity_9am
0,918.06,74.822,271.1,2.080354,295.4,2.863283,0.0,0.0,42.42
1,917.347688,71.403843,101.935179,2.443009,140.471548,3.533324,0.0,0.0,24.328697
2,923.04,60.638,51.0,17.067852,63.7,22.100967,0.0,20.0,8.9
3,920.502751,70.138895,198.832133,4.337363,211.203341,5.190045,0.0,0.0,12.189102
4,921.16,44.294,277.8,1.85666,136.5,2.863283,8.9,14730.0,92.41


In [21]:
y.columns

Index(['relative_humidity_3pm'], dtype='object')

In [22]:
x.columns

Index(['air_pressure_9am', 'air_temp_9am', 'avg_wind_direction_9am',
       'avg_wind_speed_9am', 'max_wind_direction_9am', 'max_wind_speed_9am',
       'rain_accumulation_9am', 'rain_duration_9am', 'relative_humidity_9am'],
      dtype='object')

In [23]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=324)

In [36]:
x_train.head()

Unnamed: 0,air_pressure_9am,air_temp_9am,avg_wind_direction_9am,avg_wind_speed_9am,max_wind_direction_9am,max_wind_speed_9am,rain_accumulation_9am,rain_duration_9am,relative_humidity_9am
841,918.37,72.932,184.5,2.013246,186.7,2.773806,0.0,0.0,8.81
75,920.1,53.492,186.1,13.444009,193.8,15.367778,0.0,0.0,31.47
95,927.61,54.896,55.0,4.988376,53.4,7.202947,0.0,0.0,13.51
895,919.235153,65.951112,194.343333,2.942019,216.569792,3.65881,0.0,0.0,14.073504
699,919.888128,68.687822,228.51773,3.960858,247.954028,5.185547,0.0,0.0,14.492839


In [25]:
x_test.shape[0]

352

In [37]:
y_train.head()

Unnamed: 0,relative_humidity_3pm
841,16.03
75,64.88
95,14.05
895,17.766646
699,20.986751


In [27]:
y_test.shape[0]

352

In [28]:
type(x_train)

pandas.core.frame.DataFrame

In [29]:
type(x_test)

pandas.core.frame.DataFrame

In [30]:
type(y_test)

pandas.core.frame.DataFrame

In [31]:
type(y_train)

pandas.core.frame.DataFrame

In [32]:
y_train.describe()

Unnamed: 0,relative_humidity_3pm
count,712.0
mean,35.106106
std,22.459052
min,5.3
25%,17.41858
50%,24.17
75%,51.9875
max,92.25


In [33]:
humidity_classifier = DecisionTreeClassifier(max_leaf_nodes=10, random_state=0)
type(humidity_classifier)

sklearn.tree.tree.DecisionTreeClassifier

In [34]:
humidity_classifier

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=10,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

In [35]:
humidity_classifier.fit(x_train, y_train)

ValueError: Unknown label type: 'continuous'

In [62]:
predictions = humidity_classifier.predict(x_test)

NotFittedError: This DecisionTreeClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.