### 0. Load Dependencies

In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

### 1. Load Data

In [2]:
df = pd.read_csv('data/Traffic/Metro_Interstate_Traffic_Volume.csv')

# process the date_time column
df['date_time'] = pd.to_datetime(df['date_time'])
df['year'] = df['date_time'].dt.year
df['month'] = df['date_time'].dt.month
df['day'] = df['date_time'].dt.day
df['hour'] = df['date_time'].dt.hour
df.drop('date_time', axis=1, inplace=True)

# convert it to a classification problem
df.loc[df['traffic_volume'] < 3500, 'traffic_volume'] = 0
df.loc[df['traffic_volume'] >= 3500, 'traffic_volume'] = 1 

In [3]:
# Get one hot encoding of categorical columns
one_hot = pd.get_dummies(df[['holiday', 'weather_main']]).astype(int)
df = df.drop(['holiday', 'weather_main', 'weather_description'], axis=1) # Drop original categorical columns
df = df.join(one_hot) # Join the encoded df

In [4]:
df.head()

Unnamed: 0,temp,rain_1h,snow_1h,clouds_all,traffic_volume,year,month,day,hour,holiday_Christmas Day,...,weather_main_Clouds,weather_main_Drizzle,weather_main_Fog,weather_main_Haze,weather_main_Mist,weather_main_Rain,weather_main_Smoke,weather_main_Snow,weather_main_Squall,weather_main_Thunderstorm
0,288.28,0.0,0.0,40,1,2012,10,2,9,0,...,1,0,0,0,0,0,0,0,0,0
1,289.36,0.0,0.0,75,1,2012,10,2,10,0,...,1,0,0,0,0,0,0,0,0,0
2,289.58,0.0,0.0,90,1,2012,10,2,11,0,...,1,0,0,0,0,0,0,0,0,0
3,290.13,0.0,0.0,90,1,2012,10,2,12,0,...,1,0,0,0,0,0,0,0,0,0
4,291.14,0.0,0.0,75,1,2012,10,2,13,0,...,1,0,0,0,0,0,0,0,0,0


In [5]:
year = df['year']
month = df['month']

In [6]:
year.value_counts() # Sanity check

year
2017    10605
2016     9306
2013     8573
2018     7949
2014     4839
2015     4373
2012     2559
Name: count, dtype: int64

In [7]:
month.value_counts() # Sanity check

month
7     4795
5     4436
8     4378
4     4259
12    4249
1     4006
9     3831
3     3793
6     3772
11    3686
2     3526
10    3473
Name: count, dtype: int64

In [8]:
may2013 = df[(year == 2013) & (month == 5)]
jun2013 = df[(year == 2013) & (month == 6)]
jul2013 = df[(year == 2013) & (month == 7)]
aug2013 = df[(year == 2013) & (month == 8)]
sep2013 = df[(year == 2013) & (month == 9)]
oct2013 = df[(year == 2013) & (month == 10)]
nov2013 = df[(year == 2013) & (month == 11)]
dec2013 = df[(year == 2013) & (month == 12)]

print(may2013.shape, jun2013.shape, jul2013.shape, 
      aug2013.shape, sep2013.shape, oct2013.shape, nov2013.shape)

(940, 31) (767, 31) (748, 31) (650, 31) (478, 31) (442, 31) (590, 31)


In [9]:
# Sanity check

may2013['traffic_volume'].value_counts()

traffic_volume
0    483
1    457
Name: count, dtype: int64

In [10]:
jun2013['traffic_volume'].value_counts()

traffic_volume
0    386
1    381
Name: count, dtype: int64

In [11]:
jul2013['traffic_volume'].value_counts()

traffic_volume
0    381
1    367
Name: count, dtype: int64

In [12]:
aug2013['traffic_volume'].value_counts()

traffic_volume
1    342
0    308
Name: count, dtype: int64

In [13]:
sep2013['traffic_volume'].value_counts()

traffic_volume
0    254
1    224
Name: count, dtype: int64

In [14]:
oct2013['traffic_volume'].value_counts()

traffic_volume
1    242
0    200
Name: count, dtype: int64

In [15]:
nov2013['traffic_volume'].value_counts()

traffic_volume
0    311
1    279
Name: count, dtype: int64

In [16]:
dec2013['traffic_volume'].value_counts()

traffic_volume
0    508
1    373
Name: count, dtype: int64

In [17]:
numerical_cols = ['temp', 'rain_1h', 'snow_1h', 'clouds_all', 'hour']
test_size = 0.3
n_samples = 20
seeds = list(range(n_samples))

In [18]:
# Sanity check
# len(y_may_trn[y_may_trn == 0]) / len(y_may_trn), len(y_may_val[y_may_val == 0]) / len(y_may_val)

In [19]:
for i, seed in enumerate(seeds):
    # May 2013
    X_may, y_may = may2013[may2013.columns.drop(['traffic_volume', 'year', 'month', 'day'])], may2013['traffic_volume']
    scaler = MinMaxScaler()
    X_may[numerical_cols] = scaler.fit_transform(X_may[numerical_cols])
    X_may = X_may.to_numpy()
    y_may = y_may.to_numpy()

    X_may_trn, X_may_val, y_may_trn, y_may_val = train_test_split(X_may, y_may, stratify=y_may, test_size=test_size,
                                                                  random_state=seed, shuffle=True)
    
    # June 2013
    X_jun, y_jun = jun2013[jun2013.columns.drop(['traffic_volume', 'year', 'month', 'day'])], jun2013['traffic_volume']
    scaler = MinMaxScaler()
    X_jun[numerical_cols] = scaler.fit_transform(X_jun[numerical_cols])
    X_jun = X_jun.to_numpy()
    y_jun = y_jun.to_numpy()
    X_jun_trn, X_jun_val, y_jun_trn, y_jun_val = train_test_split(X_jun, y_jun, stratify=y_jun, test_size=test_size,
                                                                  random_state=seed, shuffle=True)
    
    # July 2013
    X_jul, y_jul = jul2013[jul2013.columns.drop(['traffic_volume', 'year', 'month', 'day'])], jul2013['traffic_volume']
    scaler = MinMaxScaler()
    X_jul[numerical_cols] = scaler.fit_transform(X_jul[numerical_cols])
    X_jul = X_jul.to_numpy()
    y_jul = y_jul.to_numpy()
    X_jul_trn, X_jul_val, y_jul_trn, y_jul_val = train_test_split(X_jul, y_jul, stratify=y_jul, test_size=test_size,
                                                                  random_state=seed, shuffle=True)
    
    # August 2013
    X_aug, y_aug = aug2013[aug2013.columns.drop(['traffic_volume', 'year', 'month', 'day'])], aug2013['traffic_volume']
    scaler = MinMaxScaler()
    X_aug[numerical_cols] = scaler.fit_transform(X_aug[numerical_cols])
    X_aug = X_aug.to_numpy()
    y_aug = y_aug.to_numpy()
    X_aug_trn, X_aug_val, y_aug_trn, y_aug_val = train_test_split(X_aug, y_aug, stratify=y_aug, test_size=test_size,
                                                                  random_state=seed, shuffle=True)  
    
    
    # September 2013
    X_sep, y_sep = sep2013[sep2013.columns.drop(['traffic_volume', 'year', 'month', 'day'])], sep2013['traffic_volume']
    scaler = MinMaxScaler()
    X_sep[numerical_cols] = scaler.fit_transform(X_sep[numerical_cols])
    X_sep = X_sep.to_numpy()
    y_sep = y_sep.to_numpy()
    X_sep_trn, X_sep_val, y_sep_trn, y_sep_val = train_test_split(X_sep, y_sep, stratify=y_sep, test_size=test_size,
                                                                  random_state=seed, shuffle=True)
    
    
    # October 2013
    X_oct, y_oct = oct2013[oct2013.columns.drop(['traffic_volume', 'year', 'month', 'day'])], oct2013['traffic_volume']
    scaler = MinMaxScaler()
    X_oct[numerical_cols] = scaler.fit_transform(X_oct[numerical_cols])
    X_oct = X_oct.to_numpy()
    y_oct = y_oct.to_numpy()
    
    
    # November 2013
    X_nov, y_nov = nov2013[nov2013.columns.drop(['traffic_volume', 'year', 'month', 'day'])], nov2013['traffic_volume']
    scaler = MinMaxScaler()
    X_nov[numerical_cols] = scaler.fit_transform(X_nov[numerical_cols])
    X_nov = X_nov.to_numpy()
    y_nov = y_nov.to_numpy()
    
    
    # December 2013
    X_dec, y_dec = dec2013[dec2013.columns.drop(['traffic_volume', 'year', 'month', 'day'])], dec2013['traffic_volume']
    scaler = MinMaxScaler()
    X_dec[numerical_cols] = scaler.fit_transform(X_dec[numerical_cols])
    X_dec = X_dec.to_numpy()
    y_dec = y_dec.to_numpy()
    
    
    # Experiment 1
    X_may_jun_trn = np.concatenate((X_may_trn, X_jun_trn))
    y_may_jun_trn = np.concatenate((y_may_trn, y_jun_trn))
    X_may_jun_val = np.concatenate((X_may_val, X_jun_val))
    y_may_jun_val = np.concatenate((y_may_val, y_jun_val))

    X_jun_jul_trn = np.concatenate((X_jun_trn, X_jul_trn))
    y_jun_jul_trn = np.concatenate((y_jun_trn, y_jul_trn))
    X_jun_jul_val = np.concatenate((X_jun_val, X_jul_val))
    y_jun_jul_val = np.concatenate((y_jun_val, y_jul_val))

    X_jul_aug_trn = np.concatenate((X_jul_trn, X_aug_trn))
    y_jul_aug_trn = np.concatenate((y_jul_trn, y_aug_trn))
    X_jul_aug_val = np.concatenate((X_jul_val, X_aug_val))
    y_jul_aug_val = np.concatenate((y_jul_val, y_aug_val))

    # Experiment 2
    X_may_jun_jul_trn = np.concatenate((X_may_trn, X_jun_trn, X_jul_trn))
    y_may_jun_jul_trn = np.concatenate((y_may_trn, y_jun_trn, y_jul_trn))
    X_may_jun_jul_val = np.concatenate((X_may_val, X_jun_val, X_jul_val))
    y_may_jun_jul_val = np.concatenate((y_may_val, y_jun_val, y_jul_val))

    X_jun_jul_aug_trn = np.concatenate((X_jun_trn, X_jul_trn, X_aug_trn))
    y_jun_jul_aug_trn = np.concatenate((y_jun_trn, y_jul_trn, y_aug_trn))
    X_jun_jul_aug_val = np.concatenate((X_jun_val, X_jul_val, X_aug_val))
    y_jun_jul_aug_val = np.concatenate((y_jun_val, y_jul_val, y_aug_val))

    X_jul_aug_sep_trn = np.concatenate((X_jul_trn, X_aug_trn, X_sep_trn))
    y_jul_aug_sep_trn = np.concatenate((y_jul_trn, y_aug_trn, y_sep_trn))
    X_jul_aug_sep_val = np.concatenate((X_jul_val, X_aug_val, X_sep_val))
    y_jul_aug_sep_val = np.concatenate((y_jul_val, y_aug_val, y_sep_val))

    mydict = {'X_may_trn':X_may_trn, 'y_may_trn':y_may_trn, 'X_jun_trn':X_jun_trn, 'y_jun_trn':y_jun_trn,
              'X_jul_trn':X_jul_trn, 'y_jul_trn':y_jul_trn, 'X_aug_trn':X_aug_trn, 'y_aug_trn':y_aug_trn,
              'X_sep_trn': X_sep_trn, 'y_sep_trn': y_sep_trn,

              'X_may_jun_trn':X_may_jun_trn, 'y_may_jun_trn':y_may_jun_trn, 'X_may_jun_val':X_may_jun_val,
              'y_may_jun_val':y_may_jun_val,

              'X_jun_jul_trn':X_jun_jul_trn, 'y_jun_jul_trn':y_jun_jul_trn, 'X_jun_jul_val': X_jun_jul_val,
              'y_jun_jul_val': y_jun_jul_val,

              'X_jul_aug_trn':X_jul_aug_trn, 'y_jul_aug_trn':y_jul_aug_trn, 'X_jul_aug_val': X_jul_aug_val,
              'y_jul_aug_val': y_jul_aug_val,

              'X_may_jun_jul_trn':X_may_jun_jul_trn,'y_may_jun_jul_trn':y_may_jun_jul_trn,
              'X_may_jun_jul_val':X_may_jun_jul_val, 'y_may_jun_jul_val':y_may_jun_jul_val,

              'X_jun_jul_aug_trn':X_jun_jul_aug_trn, 'y_jun_jul_aug_trn':y_jun_jul_aug_trn,
              'X_jun_jul_aug_val': X_jun_jul_aug_val, 'y_jun_jul_aug_val': y_jun_jul_aug_val,

              'X_jul_aug_sep_trn': X_jul_aug_sep_trn, 'y_jul_aug_sep_trn': y_jul_aug_sep_trn,
              'X_jul_aug_sep_val': X_jul_aug_sep_val, 'y_jul_aug_sep_val': y_jul_aug_sep_val,
  
              'X_jul':X_jul,'y_jul':y_jul, 'X_aug':X_aug, 'y_aug':y_aug, 'X_sep':X_sep, 'y_sep':y_sep,
              'X_oct':X_oct, 'y_oct':y_oct, 'X_nov':X_nov, 'y_nov':y_nov, 'X_dec':X_dec, 'y_dec':y_dec}


    with open("data/traffic/traffic_volume_split{}.pickle".format(i+1), "wb") as fp:  # Pickling
        pickle.dump(mydict, fp)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_may[numerical_cols] = scaler.fit_transform(X_may[numerical_cols])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_jun[numerical_cols] = scaler.fit_transform(X_jun[numerical_cols])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_jul[numerical_cols] = scaler.fit_transform(X_jul[numerical_cols])
A

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_may[numerical_cols] = scaler.fit_transform(X_may[numerical_cols])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_jun[numerical_cols] = scaler.fit_transform(X_jun[numerical_cols])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_jul[numerical_cols] = scaler.fit_transform(X_jul[numerical_cols])
A

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_oct[numerical_cols] = scaler.fit_transform(X_oct[numerical_cols])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_nov[numerical_cols] = scaler.fit_transform(X_nov[numerical_cols])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_dec[numerical_cols] = scaler.fit_transform(X_dec[numerical_cols])
A

In [20]:
# Sanity check

# X_may, y_may = may2013[may2013.columns.drop(['traffic_volume', 'year', 'month', 'day'])], may2013['traffic_volume']
# X_may.columns

In [21]:
# X_may['weather_main_Clouds'].value_counts()

In [22]:
# Sanity check: logistic regression, decision tree, and KNN

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.metrics import accuracy_score

for i in range(n_samples):
    
    print('Split', i+1)
    print(' ')
    with open("data/traffic/traffic_volume_split{}.pickle".format(i+1), "rb") as fp:  # Pickling
        mydict = pickle.load(fp)
    
    X_jul_trn, y_jul_trn = mydict['X_jul_trn'], mydict['y_jul_trn']
    X_jul_aug_trn, y_jul_aug_trn = mydict['X_jul_aug_trn'], mydict['y_jul_aug_trn']
    X_jul_aug_val, y_jul_aug_val = mydict['X_jul_aug_val'], mydict['y_jul_aug_val']
    
    model_lr = LogisticRegression(solver='liblinear', max_iter=5000, random_state=666)
    model_lr.fit(X_jul_trn, y_jul_trn)
    preds_before = model_lr.predict(X_jul_aug_val)
    
    model_lr = LogisticRegression(solver='liblinear', max_iter=5000, random_state=666)
    model_lr.fit(X_jul_aug_trn, y_jul_aug_trn)
    preds_after = model_lr.predict(X_jul_aug_val)
    
    print('Logistic Regression...')
    print('Before adding August data:', accuracy_score(y_jul_aug_val, preds_before), 
          'After adding August data:', accuracy_score(y_jul_aug_val, preds_after))
    
    
    model_tree = DecisionTreeClassifier(random_state=666)
    model_tree.fit(X_jul_trn, y_jul_trn)
    preds_before = model_tree.predict(X_jul_aug_val)
    
    model_tree = DecisionTreeClassifier(random_state=666)
    model_tree.fit(X_jul_aug_trn, y_jul_aug_trn)
    preds_after = model_tree.predict(X_jul_aug_val)
    
    print('Decision Tree...')
    print('Before adding August data:', accuracy_score(y_jul_aug_val, preds_before), 
          'After adding August data:', accuracy_score(y_jul_aug_val, preds_after))
    
    
    model_knn = KNN(n_neighbors=3)
    model_knn.fit(X_jul_trn, y_jul_trn)
    preds_before = model_knn.predict(X_jul_aug_val)
    
    model_knn = KNN(n_neighbors=3)
    model_knn.fit(X_jul_aug_trn, y_jul_aug_trn)
    preds_after = model_knn.predict(X_jul_aug_val)
    
    print('KNN...')
    print('Before adding August data:', accuracy_score(y_jul_aug_val, preds_before), 
          'After adding August data:', accuracy_score(y_jul_aug_val, preds_after))
    
    print(' ')
    print(' ')

Split 1
 
Logistic Regression...
Before adding August data: 0.6309523809523809 After adding August data: 0.6452380952380953
Decision Tree...
Before adding August data: 0.8809523809523809 After adding August data: 0.8928571428571429
KNN...
Before adding August data: 0.8642857142857143 After adding August data: 0.8714285714285714
 
 
Split 2
 
Logistic Regression...
Before adding August data: 0.6190476190476191 After adding August data: 0.6666666666666666
Decision Tree...
Before adding August data: 0.9047619047619048 After adding August data: 0.9095238095238095
KNN...
Before adding August data: 0.8452380952380952 After adding August data: 0.8833333333333333
 
 
Split 3
 
Logistic Regression...
Before adding August data: 0.6261904761904762 After adding August data: 0.6238095238095238
Decision Tree...
Before adding August data: 0.8833333333333333 After adding August data: 0.8857142857142857
KNN...
Before adding August data: 0.8404761904761905 After adding August data: 0.861904761904762
 
 