In [None]:
#!pip install pycountry_convert
#!pip uninstall scikit-learn -y
#!pip install scikit-learn

Collecting scikit-learn
[?25l  Downloading https://files.pythonhosted.org/packages/f3/74/eb899f41d55f957e2591cde5528e75871f817d9fb46d4732423ecaca736d/scikit_learn-0.24.1-cp37-cp37m-manylinux2010_x86_64.whl (22.3MB)
[K     |████████████████████████████████| 22.3MB 1.8MB/s 
Collecting threadpoolctl>=2.0.0
  Downloading https://files.pythonhosted.org/packages/f7/12/ec3f2e203afa394a149911729357aa48affc59c20e2c1c8297a60f33f133/threadpoolctl-2.1.0-py3-none-any.whl
Installing collected packages: threadpoolctl, scikit-learn
Successfully installed scikit-learn-0.24.1 threadpoolctl-2.1.0


In [None]:

import pandas as pd
import numpy as np
import io
# Load the three raw datasets; the paths are relative to the working directory.
suic_df = pd.read_csv("master.csv")  # suicide dataset
bike_df = pd.read_csv("day.csv")  # bike dataset
# The video dataset is tab-separated rather than comma-separated.
vid_df = pd.read_csv("transcoding_mesurment.tsv", sep='\t')

from pycountry_convert import country_alpha2_to_continent_code, country_name_to_country_alpha2
from sklearn.compose import make_column_transformer
from sklearn.linear_model import Lasso, Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler


##### Suicide Dataset #####
def country_to_continent(country_name):
    """Map a country name to a two-letter continent code.

    The name is first resolved through ``pycountry_convert``. Names that the
    library rejects with ``KeyError`` are looked up in a small hand-written
    table. Anything still unresolved is printed (so it can be added to the
    table) and mapped to ``"Unknown"``.

    Parameters
    ----------
    country_name : str
        Country name as it appears in the dataset's ``country`` column.

    Returns
    -------
    str
        The continent code (e.g. ``'AS'``), or ``"Unknown"`` when the name
        cannot be resolved by the library or the manual table.
    """
    # Countries pycountry_convert does not recognise under these spellings.
    # NOTE(review): 'SA' for Saint Vincent and Grenadines is kept from the
    # original code; confirm it matches the code the library assigns to the
    # other Caribbean countries.
    manual_codes = {
        "Republic of Korea": 'AS',
        "Saint Vincent and Grenadines": 'SA',
    }

    try:
        return country_alpha2_to_continent_code(country_name_to_country_alpha2(country_name))
    except KeyError:
        pass

    if country_name in manual_codes:
        return manual_codes[country_name]

    # Previously this path fell through to `return item` with `item` unbound,
    # raising UnboundLocalError. Return a string sentinel instead so the
    # column stays homogeneous for the one-hot encoding applied later.
    print(country_name)
    return "Unknown"


# Work on a fresh frame without the country-year feature and the target-related columns.
suic_df_preprocessed = suic_df.drop(columns=['country-year', 'suicides/100k pop', 'suicides_no'])

# Collapse each country into its continent code.
suic_df_preprocessed["country"] = suic_df_preprocessed["country"].apply(country_to_continent)

# GDP-for-year values contain thousands separators; strip the commas so the
# values can be converted to floats.
gdp_col = ' gdp_for_year ($) '
suic_df_preprocessed[gdp_col] = suic_df_preprocessed[gdp_col].str.replace(',','')

# Standardise the numeric columns and one-hot encode the categorical ones.
suic_numeric_cols = ['year', 'population', 'HDI for year', ' gdp_for_year ($) ', 'gdp_per_capita ($)']
suic_categorical_cols = ['country', 'sex', 'age', 'generation']

preprocess_suic = make_column_transformer(
    (StandardScaler(), suic_numeric_cols),
    (OneHotEncoder(sparse=False, categories='auto', handle_unknown='ignore'), suic_categorical_cols),
    remainder='passthrough',
)

suic_df_preprocessed = preprocess_suic.fit_transform(suic_df_preprocessed)
#print(suic_df_preprocessed)



##### Bike Dataset #####
# Drop columns judged uninformative ('instant', 'dteday'), the target-related
# counts ('cnt', 'casual', 'registered') and 'atemp', which the original
# author removed as a near-duplicate of 'temp'.
bike_dropped_cols = ['instant', 'dteday', 'cnt', 'casual', 'registered', 'atemp']
bike_df_preprocessed = bike_df.drop(columns=bike_dropped_cols)

# 'season' and 'mnth' are stored as numbers but are one-hot encoded here;
# every other remaining column is standardised.
bike_onehot_cols = ['season', 'mnth']
bike_scaled_cols = bike_df_preprocessed.columns.drop(bike_onehot_cols)

preprocess_bike = make_column_transformer(
    (StandardScaler(), bike_scaled_cols),
    (OneHotEncoder(sparse=False, categories='auto', handle_unknown='ignore'), bike_onehot_cols),
    remainder='passthrough',
)

bike_df_preprocessed = preprocess_bike.fit_transform(bike_df_preprocessed)
#print(bike_df_preprocessed)



##### Video Dataset #####

# Drop the ID column (carries no information) together with 'utime' and
# 'umem'; 'utime' is the regression target used later.
vid_df_preprocessed = vid_df.drop(columns=['id', 'utime', 'umem'])

# The two codec columns are one-hot encoded; every other remaining column
# is standardised.
vid_onehot_cols = ['codec', 'o_codec']
vid_scaled_cols = vid_df_preprocessed.columns.drop(vid_onehot_cols)

preprocess_vid = make_column_transformer(
    (StandardScaler(), vid_scaled_cols),
    (OneHotEncoder(sparse=False, categories='auto', handle_unknown='ignore'), vid_onehot_cols),
    remainder='passthrough',
)

vid_df_preprocessed = preprocess_vid.fit_transform(vid_df_preprocessed)

from sklearn.feature_selection import mutual_info_regression, f_regression, SelectKBest


# Fraction of the preprocessed features that SelectKBest keeps. With the
# current value of 1 every feature is kept; lower it to actually prune.
fraction_of_total = 1


def _num_features_to_keep(matrix):
    """Return floor(fraction_of_total * number of columns of ``matrix``) as an int."""
    return int(np.floor(fraction_of_total * matrix.shape[1]))


# Scoring function per dataset (f_regression is the alternative that was tried).
score_func_suic = mutual_info_regression
score_func_bike = mutual_info_regression
score_func_vid = mutual_info_regression

num_feat_suic = _num_features_to_keep(suic_df_preprocessed)
num_feat_bike = _num_features_to_keep(bike_df_preprocessed)
num_feat_vid = _num_features_to_keep(vid_df_preprocessed)

fs_suic = SelectKBest(score_func=score_func_suic, k=num_feat_suic)
fs_bike = SelectKBest(score_func=score_func_bike, k=num_feat_bike)
fs_vid = SelectKBest(score_func=score_func_vid, k=num_feat_vid)

# Regression targets, taken from the raw (unprocessed) frames.
labels_suic = suic_df['suicides/100k pop']
labels_bike = bike_df['cnt']
labels_vid = vid_df['utime']

# Replace NaNs in the suicide feature matrix with 0 (placeholder strategy,
# may be revisited).
df_nans = np.isnan(suic_df_preprocessed)
suic_df_preprocessed[df_nans] = 0

# Keep the k best-scoring features of each dataset.
suic_data_ready = fs_suic.fit_transform(suic_df_preprocessed, labels_suic)
bike_data_ready = fs_bike.fit_transform(bike_df_preprocessed, labels_bike)
vid_data_ready = fs_vid.fit_transform(vid_df_preprocessed, labels_vid)

In [None]:
#From project 1
class EstimatorSelectionHelper:
    """Run one ``GridSearchCV`` per estimator and tabulate the CV scores.

    Parameters
    ----------
    models : dict
        Maps an estimator name to an estimator instance.
    params : dict
        Maps an estimator name to the parameter grid passed to
        ``GridSearchCV``. Every key of ``models`` must also be a key here.

    Raises
    ------
    ValueError
        If some estimator in ``models`` has no entry in ``params``.
    """

    def __init__(self, models, params):
        # Iterating `models` keeps the reported names in a deterministic order.
        missing_params = [name for name in models if name not in params]
        if missing_params:
            raise ValueError("Some estimators are missing parameters: %s" % missing_params)
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.grid_searches = {}

    def _run_grid_searches(self, X, y, cv, n_jobs, verbose, scoring, refit, fit_params=None):
        """Fit a ``GridSearchCV`` for every estimator and store it in ``grid_searches``.

        ``fit_params`` is a dict of extra keyword arguments forwarded to
        ``GridSearchCV.fit``. It is a dict rather than ``**kwargs`` because
        some callers forward a ``verbose`` fit parameter, which would clash
        with this method's own ``verbose`` argument.
        """
        fit_params = fit_params or {}
        for key in self.keys:
            print("Running GridSearchCV for %s." % key)
            gs = GridSearchCV(self.models[key], self.params[key], cv=cv, n_jobs=n_jobs,
                              verbose=verbose, scoring=scoring, refit=refit,
                              return_train_score=True)
            gs.fit(X, y, **fit_params)
            self.grid_searches[key] = gs

    def fit(self, X, y, cv=3, n_jobs=3, verbose=1, scoring=None, refit=False):
        """Grid-search every estimator on ``X``/``y`` with no extra fit parameters."""
        self._run_grid_searches(X, y, cv, n_jobs, verbose, scoring, refit)

    def fit_LGB(self, X, y, cv=3, n_jobs=3, feature_name=None, categorical_feature=None, verbose=1, scoring=None, refit=False):
        """Like ``fit``, but forwards ``feature_name``, ``categorical_feature`` and ``verbose`` to each estimator's fit call."""
        self._run_grid_searches(
            X, y, cv, n_jobs, verbose, scoring, refit,
            fit_params={
                'feature_name': feature_name,
                'categorical_feature': categorical_feature,
                'verbose': verbose,
            },
        )

    def fit_Cat(self, X, y, cv=3, n_jobs=3, cat_features=None, verbose=1, scoring=None, refit=False):
        """Like ``fit``, but forwards ``cat_features`` and ``verbose`` to each estimator's fit call."""
        self._run_grid_searches(
            X, y, cv, n_jobs, verbose, scoring, refit,
            fit_params={'cat_features': cat_features, 'verbose': verbose},
        )

    def score_summary(self, sort_by='mean_score'):
        """Return one row per (estimator, parameter combination) with its CV scores.

        Parameters
        ----------
        sort_by : str
            Column to sort by, in descending order.

        Returns
        -------
        pandas.DataFrame
            Columns ``estimator``, ``min_score``, ``mean_score``,
            ``max_score``, ``std_score``, ``mean_train_score`` followed by one
            column per searched parameter.

        Raises
        ------
        ValueError
            If no grid search has been fitted yet.
        """
        rows = []
        for name, gs in self.grid_searches.items():
            results = gs.cv_results_
            # Use the fitted number of folds instead of `gs.cv`: `cv` may be
            # None or a splitter object, in which case range(gs.cv) fails.
            split_scores = np.column_stack(
                [results["split{}_test_score".format(i)] for i in range(gs.n_splits_)]
            )
            for params, scores, mean_train_score in zip(
                    results['params'], split_scores, results['mean_train_score']):
                rows.append(pd.Series({
                    **params,
                    'estimator': name,
                    'min_score': scores.min(),
                    'max_score': scores.max(),
                    'mean_score': np.mean(scores),
                    'std_score': np.std(scores),
                    'mean_train_score': mean_train_score,
                }))

        if not rows:
            raise ValueError("No grid search results available; call a fit method before score_summary().")

        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)

        # Fixed score columns first, then whatever parameter columns exist.
        columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score', 'mean_train_score']
        columns = columns + [c for c in df.columns if c not in columns]

        return df[columns]
    

In [None]:
from sklearn.feature_selection import f_regression, mutual_info_regression


def top_n_indices(scores, n):
    """Return the indices of the ``n`` highest scores, lowest of them first.

    NaN scores are ranked last. ``np.argsort`` on its own places NaN at the
    end of the ordering, which made the constant bias column '1' (whose
    F-score is NaN) show up as a "top" feature.
    """
    if n <= 0:
        return np.array([], dtype=int)
    ranked = np.where(np.isnan(scores), -np.inf, scores)
    return np.argsort(ranked)[-n:]


def poly_feature_names(poly):
    """Return the output feature names of a fitted PolynomialFeatures as a list.

    Prefers ``get_feature_names_out`` when the installed scikit-learn has it
    and falls back to the older ``get_feature_names`` otherwise.
    """
    getter = getattr(poly, "get_feature_names_out", None)
    if getter is None:
        getter = poly.get_feature_names
    return list(getter())


# Degree-2 polynomial expansion of each selected feature matrix.
poly_features_suic = PolynomialFeatures(degree=2)
suic_poly = poly_features_suic.fit_transform(suic_data_ready)
poly_features_bike = PolynomialFeatures(degree=2)
bike_poly = poly_features_bike.fit_transform(bike_data_ready)
poly_features_vid = PolynomialFeatures(degree=2)
vid_poly = poly_features_vid.fit_transform(vid_data_ready)

# Score every polynomial feature against the target with both criteria.
f_bike, p_bike = f_regression(bike_poly, labels_bike)
f_suic, p_suic = f_regression(suic_poly, labels_suic)
f_vid, p_vid = f_regression(vid_poly, labels_vid)

# Mutual information estimation is randomised; fix the seed so the ranking
# is reproducible across runs.
MI_RANDOM_STATE = 0
mi_bike = mutual_info_regression(bike_poly, labels_bike, random_state=MI_RANDOM_STATE)
mi_suic = mutual_info_regression(suic_poly, labels_suic, random_state=MI_RANDOM_STATE)
mi_vid = mutual_info_regression(vid_poly, labels_vid, random_state=MI_RANDOM_STATE)

n = 3

# Look the names up once per dataset instead of once per loop iteration.
bike_feature_names = poly_feature_names(poly_features_bike)
suic_feature_names = poly_feature_names(poly_features_suic)
vid_feature_names = poly_feature_names(poly_features_vid)

# Top-n features by F-score.
top_n_bike_index = top_n_indices(f_bike, n)
top_n_suic_index = top_n_indices(f_suic, n)
top_n_vid_index = top_n_indices(f_vid, n)

top_n_bike_f = [bike_feature_names[i] for i in top_n_bike_index]
top_n_suic_f = [suic_feature_names[i] for i in top_n_suic_index]
top_n_vid_f = [vid_feature_names[i] for i in top_n_vid_index]

# Top-n features by mutual information (the index variables are reused, as before).
top_n_bike_index = top_n_indices(mi_bike, n)
top_n_suic_index = top_n_indices(mi_suic, n)
top_n_vid_index = top_n_indices(mi_vid, n)

top_n_bike_mi = [bike_feature_names[i] for i in top_n_bike_index]
top_n_suic_mi = [suic_feature_names[i] for i in top_n_suic_index]
top_n_vid_mi = [vid_feature_names[i] for i in top_n_vid_index]

print(top_n_bike_f)
print(top_n_bike_mi)
print(top_n_suic_f)
print(top_n_suic_mi)
print(top_n_vid_f)
print(top_n_vid_mi)



  corr /= X_norms
  corr /= X_norms
  F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom
  corr /= X_norms
  corr /= X_norms
  F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom
  corr /= X_norms
  corr /= X_norms
  F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom


['x10 x23', 'x10 x21', '1']
['x0 x5', 'x5', 'x1 x5']
['x6 x9', 'x14 x17', '1']
['x1^2', 'x1 x11', 'x1']
['x17 x18', 'x17 x20', '1']
['x9 x15', 'x3 x16', 'x3 x15']


In [None]:
from copy import copy


def _columns_after_drop(frame, dropped):
    """Return ``frame``'s column names as a list with each name in ``dropped`` removed."""
    remaining = copy(list(frame.columns))
    for name in dropped:
        remaining.remove(name)
    return remaining


# Column names left in each raw frame once the columns dropped during
# preprocessing are removed.
bike_cols = _columns_after_drop(bike_df, ['instant', 'dteday', 'cnt', 'casual', 'registered', 'atemp'])
suic_cols = _columns_after_drop(suic_df, ['country-year', 'suicides/100k pop', 'suicides_no'])
vid_cols = _columns_after_drop(vid_df, ['id', 'utime', 'umem'])

print(bike_cols)
print(suic_cols)
print(vid_cols)


['season', 'yr', 'mnth', 'holiday', 'weekday', 'workingday', 'weathersit', 'temp', 'hum', 'windspeed']
['country', 'year', 'sex', 'age', 'population', 'HDI for year', ' gdp_for_year ($) ', 'gdp_per_capita ($)', 'generation']
['duration', 'codec', 'width', 'height', 'bitrate', 'framerate', 'i', 'p', 'b', 'frames', 'i_size', 'p_size', 'b_size', 'size', 'o_codec', 'o_bitrate', 'o_framerate', 'o_width', 'o_height']


In [None]:
#Q 15
# L1- and L2-regularised linear regressors (the variable names say
# "classifier", but both are regression models).
l1_classifier = Lasso(max_iter=10000)
l2_classifier = Ridge(max_iter=10000)

# Regularisation strengths searched for both models.
params = {
    'alpha': [0.1, 0.01,.001]
}

models_in = {
    'Lasso': l1_classifier,
    'Ridge': l2_classifier
}

params_in = {
    'Lasso': params,
    'Ridge': params,
}

# Suicide dataset, degree-2 polynomial expansion.
# The expansion must be built from the suicide features: it was previously
# built from bike_data_ready, whose rows do not correspond to labels_suic.
poly_features = PolynomialFeatures(degree = 2)
suic_poly = poly_features.fit_transform(suic_data_ready)
grid_search = EstimatorSelectionHelper(models_in, params_in)
grid_search.fit(suic_poly, labels_suic, n_jobs=-1, cv=5,scoring='neg_root_mean_squared_error')
grid_search.score_summary(sort_by='mean_score')

In [None]:
# Suicide dataset, degree-3 polynomial expansion: 5-fold grid search over the
# Lasso/Ridge alpha grid, scored by negative RMSE.
poly_features = PolynomialFeatures(degree=3)
suic_poly = poly_features.fit_transform(suic_data_ready)

grid_search = EstimatorSelectionHelper(models_in, params_in)
grid_search.fit(
    suic_poly,
    labels_suic,
    cv=5,
    n_jobs=-1,
    scoring='neg_root_mean_squared_error',
)
grid_search.score_summary(sort_by='mean_score')

In [None]:
# Suicide dataset, degree-4 polynomial expansion: 5-fold grid search over the
# Lasso/Ridge alpha grid, scored by negative RMSE.
poly_features = PolynomialFeatures(degree=4)
suic_poly = poly_features.fit_transform(suic_data_ready)

grid_search = EstimatorSelectionHelper(models_in, params_in)
grid_search.fit(
    suic_poly,
    labels_suic,
    cv=5,
    n_jobs=-1,
    scoring='neg_root_mean_squared_error',
)
grid_search.score_summary(sort_by='mean_score')

In [None]:
# Bike dataset, degree-2 polynomial expansion: 5-fold grid search over the
# Lasso/Ridge alpha grid, scored by negative RMSE.
poly_features = PolynomialFeatures(degree=2)
bike_poly = poly_features.fit_transform(bike_data_ready)

grid_search = EstimatorSelectionHelper(models_in, params_in)
grid_search.fit(
    bike_poly,
    labels_bike,
    cv=5,
    n_jobs=-1,
    scoring='neg_root_mean_squared_error',
)
grid_search.score_summary(sort_by='mean_score')

In [None]:
# Bike dataset, degree-3 polynomial expansion: 5-fold grid search over the
# Lasso/Ridge alpha grid, scored by negative RMSE.
poly_features = PolynomialFeatures(degree=3)
bike_poly = poly_features.fit_transform(bike_data_ready)

grid_search = EstimatorSelectionHelper(models_in, params_in)
grid_search.fit(
    bike_poly,
    labels_bike,
    cv=5,
    n_jobs=-1,
    scoring='neg_root_mean_squared_error',
)
grid_search.score_summary(sort_by='mean_score')

In [None]:
# Bike dataset, degree-4 polynomial expansion: 5-fold grid search over the
# Lasso/Ridge alpha grid, scored by negative RMSE.
poly_features = PolynomialFeatures(degree=4)
bike_poly = poly_features.fit_transform(bike_data_ready)

grid_search = EstimatorSelectionHelper(models_in, params_in)
grid_search.fit(
    bike_poly,
    labels_bike,
    cv=5,
    n_jobs=-1,
    scoring='neg_root_mean_squared_error',
)
grid_search.score_summary(sort_by='mean_score')

In [None]:
# Video dataset, degree-2 polynomial expansion: 5-fold grid search over the
# Lasso/Ridge alpha grid, scored by negative RMSE.
poly_features = PolynomialFeatures(degree=2)
vid_poly = poly_features.fit_transform(vid_data_ready)

grid_search = EstimatorSelectionHelper(models_in, params_in)
grid_search.fit(
    vid_poly,
    labels_vid,
    cv=5,
    n_jobs=-1,
    scoring='neg_root_mean_squared_error',
)
grid_search.score_summary(sort_by='mean_score')

In [None]:
# Video dataset, degree-3 polynomial expansion: 5-fold grid search over the
# Lasso/Ridge alpha grid, scored by negative RMSE.
poly_features = PolynomialFeatures(degree=3)
vid_poly = poly_features.fit_transform(vid_data_ready)

grid_search = EstimatorSelectionHelper(models_in, params_in)
grid_search.fit(
    vid_poly,
    labels_vid,
    cv=5,
    n_jobs=-1,
    scoring='neg_root_mean_squared_error',
)
grid_search.score_summary(sort_by='mean_score')

In [None]:
# Video dataset, degree-4 polynomial expansion: 5-fold grid search over the
# Lasso/Ridge alpha grid, scored by negative RMSE.
poly_features = PolynomialFeatures(degree=4)
vid_poly = poly_features.fit_transform(vid_data_ready)

grid_search = EstimatorSelectionHelper(models_in, params_in)
grid_search.fit(
    vid_poly,
    labels_vid,
    cv=5,
    n_jobs=-1,
    scoring='neg_root_mean_squared_error',
)
grid_search.score_summary(sort_by='mean_score')