In [1]:
import seaborn as sns
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge,LinearRegression,ElasticNet,Lasso
from yellowbrick.regressor import prediction_error,residuals_plot


In [None]:
# Hand-entered table of 11 rows x 13 float values.
# NOTE(review): presumably the 2017 LPV scenario inputs that are later turned
# into a DataFrame (e.g. via step_7_1_prepare) - confirm against the calling cell.
# NOTE(review): the 13 positions look like the feature columns returned by
# step_4_1_LPV_cols without the trailing target column - TODO confirm.
#
# Layout visible in the literals themselves:
#   * row 0 populates all 13 positions;
#   * rows 1-5 carry one count in position 0, repeat it in exactly one of
#     positions 3-7, and leave position 1 at zero;
#   * rows 6-10 carry one count in position 1, repeat it in exactly one of
#     positions 8-12, and leave position 0 at zero;
#   * position 2 grows by 5 from one row to the next inside each group of five.
split_2017_LPV_rows = [[1.59009400e+06, 1.61762700e+06, 1.43115413e+01, 2.84522000e+05, 2.54697000e+05,
                        4.53646000e+05, 2.51078000e+05, 3.46148000e+05, 2.89449000e+05, 2.59108000e+05,
                        4.61501000e+05, 2.55426000e+05, 3.52141000e+05],
                       [284522.0, 0.0, 2.975101564, 284522.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
                       [254697.0, 0.0, 7.975101564, 0.0, 254697.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
                       [453646.0, 0.0, 12.97510156, 0.0, 0.0, 453646.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
                       [251078.0, 0.0, 17.97510156, 0.0, 0.0, 0.0, 251078.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
                       [346148.0, 0.0, 22.97510156, 0.0, 0.0, 0.0, 0.0, 346148.0, 0.0, 0.0, 0.0, 0.0, 0.0],
                       [0.0, 289449.0, 9.130000000, 0.0, 0.0, 0.0, 0.0, 0.0, 289449.0, 0.0, 0.0, 0.0, 0.0],
                       [0.0, 259108.0, 14.13000000, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 259108.0, 0.0, 0.0, 0.0],
                       [0.0, 461501.0, 19.13000000, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 461501.0, 0.0, 0.0],
                       [0.0, 255426.0, 24.13000000, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 255426.0, 0.0],
                       [0.0, 352141.0, 29.13000000, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 352141.0], ]

# Hand-entered table of 11 rows x 13 float values, laid out like a one-hot
# breakdown: row 0 populates every position, rows 1-5 carry a single count in
# position 0 and repeat it in one of positions 3-7, rows 6-10 carry a single
# count in position 1 and repeat it in one of positions 8-12. Position 2 grows
# by 5 from row to row inside each group of five.
# NOTE(review): presumably the 2017 LCV scenario inputs whose 13 positions match
# the feature columns returned by step_4_1_LCV_cols (minus the target) - TODO confirm.
# The leading zeros (e.g. 085203.0, 07.98) are only padding to keep the columns
# visually aligned; they are legal because every literal is a float, not an int.
split_2017_LCV_rows = [[4.76173000e+05, 1.06367000e+05, 1.25145441e+01, 8.52030000e+04, 7.62720000e+04,
                        1.35849000e+05, 7.51880000e+04, 1.03658000e+05, 1.90320000e+04, 1.70370000e+04,
                        3.03460000e+04, 1.67950000e+04, 2.31550000e+04],
                       [085203.0, 0.0, 2.868356783, 085203.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
                       [076272.0, 0.0, 7.868356783, 0.0, 076272.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
                       [135849.0, 0.0, 12.86835678, 0.0, 0.0, 135849.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
                       [075188.0, 0.0, 17.86835678, 0.0, 0.0, 0.0, 075188.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
                       [103658.0, 0.0, 22.86835678, 0.0, 0.0, 0.0, 0.0, 103658.0, 0.0, 0.0, 0.0, 0.0, 0.0],
                       [0.0, 019032.0, 07.98000000, 0.0, 0.0, 0.0, 0.0, 0.0, 019032.0, 0.0, 0.0, 0.0, 0.0],
                       [0.0, 017037.0, 12.98000000, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 017037.0, 0.0, 0.0, 0.0],
                       [0.0, 030346.0, 17.98000000, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 030346.0, 0.0, 0.0],
                       [0.0, 016795.0, 22.98000000, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 016795.0, 0.0],
                       [0.0, 023155.0, 27.98000000, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 023155.0], ]

In [2]:
def load_average_vehicle(file_dir):
    """Read the table on sheet "2.1, 2.2, 2.3,2.4" of an Excel workbook.

    Parameters
    ----------
    file_dir : path-like or file-like
        Workbook location, forwarded unchanged to ``pandas.read_excel``.

    Returns
    -------
    pandas.DataFrame
        At most 19 data rows from spreadsheet columns A:AH, with the row at
        zero-based position 2 used as the header.
    """
    return pd.read_excel(
        file_dir,
        sheet_name="2.1, 2.2, 2.3,2.4",
        header=2,
        nrows=19,
        usecols="A:AH",
    )

In [3]:
def load_light_fleet_age(file_dir):
    """Read the table on sheet "2.10" of an Excel workbook.

    Parameters
    ----------
    file_dir : path-like or file-like
        Workbook location, forwarded unchanged to ``pandas.read_excel``.

    Returns
    -------
    pandas.DataFrame
        At most 7 data rows (all columns), with the row at zero-based
        position 1 used as the header.
    """
    return pd.read_excel(
        file_dir,
        sheet_name="2.10",
        header=1,
        nrows=7,
    )

In [4]:
def load_co2_emission(file_dir):
    """Read the table on sheet "1.10" of an Excel workbook.

    Parameters
    ----------
    file_dir : path-like or file-like
        Workbook location, forwarded unchanged to ``pandas.read_excel``.

    Returns
    -------
    pandas.DataFrame
        At most 17 data rows from spreadsheet columns A:E, with the row at
        zero-based position 2 used as the header.
    """
    return pd.read_excel(
        file_dir,
        sheet_name="1.10",
        header=2,
        nrows=17,
        usecols="A:E",
    )

In [5]:
def get_sort_array_index(array):
    """Print the sorted values of ``array`` and the index order that sorts it.

    The index order is computed with a stable argsort, so every position of
    ``array`` appears exactly once and tied values keep their original relative
    order. (A value-equality scan would list tied positions several times and
    would never match NaN entries.)

    Parameters
    ----------
    array : array-like
        One-dimensional sequence of sortable values, e.g. one row of
        ``PCA.components_``.

    Returns
    -------
    list of int
        Positions into ``array`` ordered from the smallest to the largest
        value. The same list is printed after the sorted values.
    """
    values = np.asarray(array)
    print(np.sort(values))

    # .tolist() yields plain Python ints so the printed form stays "[i, j, ...]".
    order = np.argsort(values, kind='stable').tolist()
    print(order)

    return order

In [None]:
def exploration(column,column_name):
    """Print NaN-aware summary statistics for one column.

    Parameters
    ----------
    column : array-like
        Numeric values; converted with ``np.array`` before any statistic is
        taken. NaN entries are ignored by every statistic.
    column_name : str
        Label inserted at the start of the printed line.

    Returns
    -------
    None
        The summary (minimum, maximum, variance and standard deviation, each
        with two decimals) is written to stdout only.
    """
    values = np.array(column)
    lowest = np.nanmin(values)
    highest = np.nanmax(values)
    spread = np.nanvar(values)
    deviation = np.nanstd(values)

    summary = 'column: %s, range: [%.2f, %.2f] variance: %.2f +/- %.2f' % (
        column_name, lowest, highest, spread, deviation)
    print(summary)

In [6]:
def step_2_4_1_imputation_co2_emission_df(co2_emission_df):
    """Add mean-imputed rows for 2000 and 2018 to the CO2 emission table.

    Two placeholder rows are built whose first value is the year (2000 and
    2018) followed by four NaN entries. A mean-strategy ``SimpleImputer``
    fitted on ``co2_emission_df`` replaces each NaN with the mean of the
    corresponding column, and the completed rows are appended to the table.

    Parameters
    ----------
    co2_emission_df : pandas.DataFrame
        Must contain a ``'Year'`` column (used for sorting).
        NOTE(review): the placeholder rows assume exactly five columns with
        the year in the first position - confirm against the loader.

    Returns
    -------
    pandas.DataFrame
        A new frame with the two extra rows, sorted by ``'Year'`` and
        re-indexed 0..n-1. The input frame is not modified.
    """
    # Only the year is known for these rows; everything else is left to the imputer.
    placeholder_rows = [[year] + [np.nan] * 4 for year in (2000, 2018)]

    mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    mean_imputer.fit(co2_emission_df)
    filled_rows = pd.DataFrame(
        mean_imputer.transform(placeholder_rows),
        columns=co2_emission_df.columns,
    )

    extended = pd.concat([co2_emission_df, filled_rows]).sort_values(by=['Year'])
    extended.index = list(range(len(extended.index)))

    return extended

In [7]:
def plot_normality(target, name):
    """Plot the normal density implied by the mean and std of ``target``.

    Only the fitted normal curve is drawn, evaluated at 1000 evenly spaced
    points between the minimum and maximum of ``target``; the observations
    themselves are not plotted.

    Parameters
    ----------
    target : array-like
        Numeric sample used to estimate the mean and standard deviation.
    name : str
        Label interpolated into the figure title.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    sample_mean = np.mean(target)
    sample_std = np.std(target)

    grid = np.linspace(np.min(target), np.max(target), 1000)
    density = stats.norm.pdf(grid, sample_mean, sample_std)

    plt.plot(grid, density, label="PDF")
    plt.grid()
    plt.title('Check Normal Distribution for %s' % name, fontsize=10)
    plt.xlabel('x')
    plt.ylabel('Probability')
    plt.show()

In [8]:
def step_2_4_2_check_normality(number_fleets_df, light_fleet_age_df,co2_emission_df):
    """Draw a normality plot for every fleet, age-group and CO2 series.

    Each series is passed to ``plot_normality`` in a fixed order: the four
    fleet totals, the five age-group rows, then the two CO2 emission columns.

    Parameters
    ----------
    number_fleets_df : pandas.DataFrame
        Needs the columns ``'Total LPV new'``, ``' Total LPV used'``,
        ``'Total LCV new'`` and ``' Total LCV used'`` (the "used" keys start
        with a space).
    light_fleet_age_df : pandas.DataFrame
        Rows 0-4 are read positionally; for each row the values at positions
        1..19 are cast to ``int`` before plotting.
    co2_emission_df : pandas.DataFrame
        Needs the columns ``'Light passenger'`` and ``'Light commercial'``.

    Returns
    -------
    None
    """
    fleet_series = (
        ('Total LPV new', 'Total LPV new'),
        (' Total LPV used', 'Total LPV used'),
        ('Total LCV new', 'Total LCV new'),
        (' Total LCV used', 'Total LCV used'),
    )
    for column_key, title in fleet_series:
        plot_normality(number_fleets_df[column_key], title)

    age_group_titles = ('0-4 age group', '5-9 age group', '10-14 age group',
                        '15-19 age group', '20+ age group')
    for row_position, title in enumerate(age_group_titles):
        yearly_counts = light_fleet_age_df.iloc[row_position][1:20].astype(int)
        plot_normality(np.array(yearly_counts), title)

    emission_series = (
        ('Light passenger', 'Light passenger co2 emssion'),
        ('Light commercial', 'Light commercial co2 emission'),
    )
    for column_key, title in emission_series:
        plot_normality(co2_emission_df[column_key], title)

In [9]:
def step_3_1_clean_light_age_distribution(light_fleet_age_df):
    """Reshape the light-fleet age sheet into one row per year.

    The frame is transposed and split by position: transposed rows 1..19
    become the count columns (transposed column 6 is dropped) and rows 20
    onwards become the percentage columns (transposed column 5 is dropped).
    Both halves are labelled with the years 2000..2018 and joined side by side.

    Parameters
    ----------
    light_fleet_age_df : pandas.DataFrame
        Frame with row labels 0..6. The year labels are hard-coded to 19
        entries, so the transposed frame must provide exactly 19 rows for each
        half (i.e. 39 input columns).

    Returns
    -------
    pandas.DataFrame
        Columns: ``'Period'``, six count columns and six percentage columns,
        indexed 0..18. The input frame is not modified.
    """
    year_labels = list(range(2000, 2019))

    transposed = light_fleet_age_df.T
    transposed.index = list(range(len(transposed.index)))

    # Count block: positions 1..19 of the transposed frame.
    counts = transposed.iloc[1:20].drop(columns=[6])
    counts.columns = ['0-4 years', '5-9 years', '10-14 years', '15-19 years', '20+ years', 'Total']
    counts.index = list(year_labels)

    # Percentage block: everything from position 20 onwards.
    shares = transposed.iloc[20:].drop(columns=5)
    shares.columns = ['0-4 years percentage', '5-9 years  percentage',
                      '10-14 years percentage', '15-19 years percentage',
                      '20+ years percentage', '15+ years percentage']
    shares.index = list(year_labels)

    combined = pd.concat([counts, shares], axis=1, join='inner')
    combined.insert(0, 'Period', combined.index)
    combined.index = list(range(len(combined.index)))

    return combined

In [10]:
def step_3_3_construct_new_distribution_df(nums_columns, percentage_columns, number_fleets_df, new_age_distribution_df):
    """Multiply every percentage column with every fleet-count column.

    For each pair the product ``new_age_distribution_df[pct] * number_fleets_df[num]``
    is stored under the name ``'<pct label> of <num label>'``. The pct label is
    the percentage column name without its last 11 characters, and the num
    label is the count column name without its first 6 characters, both
    stripped of surrounding whitespace.

    Parameters
    ----------
    nums_columns : sequence of str
        Column names in ``number_fleets_df`` (outer loop).
    percentage_columns : sequence of str
        Column names in ``new_age_distribution_df`` (inner loop).
    number_fleets_df : pandas.DataFrame
    new_age_distribution_df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        One column per (count, percentage) pair, in loop order.
    """
    derived = {}
    for fleet_column in nums_columns:
        fleet_label = fleet_column[6:].strip()
        for share_column in percentage_columns:
            age_label = share_column[:-11].strip()
            product = new_age_distribution_df[share_column] * number_fleets_df[fleet_column]
            derived['%s of %s' % (age_label, fleet_label)] = product

    return pd.DataFrame(derived)

In [11]:
def step_3_5_convert_object_to_int(data_df):
    """Cast every column that is not ``float64`` to ``int64``, in place.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Frame whose non-float64 columns hold values convertible to integers.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with its columns reassigned.
    """
    non_float_columns = [name for name in data_df.columns
                         if data_df.dtypes[name] != np.float64]

    for name in non_float_columns:
        data_df[name] = data_df[name].astype(np.int64)

    return data_df

In [None]:
def step_4_1_LPV_cols(data_df, printed=True):
    """Run a PCA over the LPV columns and return the LPV modelling columns.

    The PCA is fitted on 16 LPV-related columns of ``data_df`` and is used for
    diagnostics only: the returned list is fixed and does not depend on the
    PCA result. Compared with the PCA input it omits ``'Period'`` and the two
    ``'15+ years ...'`` columns and ends with ``'Light passenger'``.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Must contain every column named in the PCA input list.
    printed : bool, default True
        When true, print the explained-variance ratios, the first two
        components, the sum of the first two ratios, and the sort order of
        each of those two components.

    Returns
    -------
    list of str
        The hard-coded LPV column selection.
    """
    pca_input_cols = ['Period', 'Total LPV new',' Total LPV used', 'Light passenger average age',
                      '0-4 years of LPV new', '5-9 years of LPV new', '10-14 years of LPV new',
                      '15-19 years of LPV new', '20+ years of LPV new', '15+ years of LPV new',
                      '0-4 years of LPV used', '5-9 years of LPV used', '10-14 years of LPV used',
                      '15-19 years of LPV used', '20+ years of LPV used', '15+ years of LPV used', ]

    selected_cols = ['Total LPV new', ' Total LPV used', 'Light passenger average age',
                     '0-4 years of LPV new', '5-9 years of LPV new', '10-14 years of LPV new',
                     '15-19 years of LPV new', '20+ years of LPV new',
                     '0-4 years of LPV used', '5-9 years of LPV used', '10-14 years of LPV used',
                     '15-19 years of LPV used', '20+ years of LPV used', 'Light passenger']

    pca_frame = data_df[pca_input_cols]
    fitted = PCA(n_components=len(pca_frame.columns))
    fitted.fit(pca_frame)
    variance_ratios = fitted.explained_variance_ratio_

    if printed:
        print(variance_ratios)
        print(fitted.components_[0:2])
        print(variance_ratios[0] + variance_ratios[1])
        for component_position in (0, 1):
            get_sort_array_index(fitted.components_[component_position])

    return selected_cols

In [13]:
def step_4_1_LCV_cols(data_df,printed=True):
    """Run a PCA over the LCV columns and return the LCV modelling columns.

    The PCA is fitted on 16 LCV-related columns of ``data_df`` and is used for
    diagnostics only: the returned list is fixed and does not depend on the
    PCA result. Compared with the PCA input it omits ``'Period'`` and the two
    ``'15+ years ...'`` columns and ends with ``'Light commercial'``.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Must contain every column named in the PCA input list, including
        ``'15+ years of LCV new'`` and ``'15+ years of LCV used'``.
    printed : bool, default True
        When true, print the explained-variance ratios, the first component,
        the first ratio, and the sort order of the first component.

    Returns
    -------
    list of str
        The hard-coded LCV column selection.
    """
    # The two '15+ years' entries must be the LCV columns: feeding the LPV
    # ones into this PCA would mix passenger-vehicle data into the
    # commercial-vehicle analysis.
    default_LCV_cols = ['Period',  'Total LCV new', ' Total LCV used', 'Light commercial average age',
                        '0-4 years of LCV new', '5-9 years of LCV new', '10-14 years of LCV new',
                        '15-19 years of LCV new', '20+ years of LCV new', '15+ years of LCV new',
                        '0-4 years of LCV used', '5-9 years of LCV used', '10-14 years of LCV used',
                        '15-19 years of LCV used', '20+ years of LCV used', '15+ years of LCV used', ]

    LCV = data_df[default_LCV_cols]

    pca = PCA(n_components=len(LCV.columns))
    pca.fit(LCV)
    pca_values = pca.explained_variance_ratio_
    if printed:
        print(pca_values)
        print(pca.components_[0])
        print(pca_values[0])
        get_sort_array_index(pca.components_[0])

    reduced_LCV_cols = ['Total LCV new', ' Total LCV used', 'Light commercial average age',
                        '0-4 years of LCV new', '5-9 years of LCV new', '10-14 years of LCV new',
                        '15-19 years of LCV new', '20+ years of LCV new',
                        '0-4 years of LCV used', '5-9 years of LCV used', '10-14 years of LCV used',
                        '15-19 years of LCV used', '20+ years of LCV used', 'Light commercial']
    return reduced_LCV_cols

In [14]:
def step_4_2_normalization(data_df,LPV_cols,LCV_cols):
    """L2-normalise the LPV and LCV column subsets of ``data_df`` row by row.

    ``normalize(..., axis=1, norm='l2')`` rescales each row (sample) of the
    selected columns independently; no per-column scaling is applied.

    Parameters
    ----------
    data_df : pandas.DataFrame
    LPV_cols, LCV_cols : list of str
        Column names selected from ``data_df`` for each subset.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(LPV_df, LCV_df)``, each rebuilt from the normalised array with the
        requested column names and a fresh default index.
    """
    def _row_normalised(cols):
        # Build a new frame from the normalised array; the original index is not carried over.
        scaled = normalize(data_df[cols], axis=1, norm='l2')
        return pd.DataFrame(scaled, columns=cols)

    return _row_normalised(LPV_cols), _row_normalised(LCV_cols)

In [None]:
def step_5_split_data_add_noisy_data(data_df, test_size, random_state=None):
    """Split ``data_df`` into train/test sets and append extra rows to each.

    The last column of ``data_df`` is the target; all preceding columns are
    features. After the random split:

    * an all-zero "noisy" sample (features 0.0, target 0.0) is appended to
      both the training and the test set;
    * the second-to-last row of ``data_df`` is additionally appended to the
      training set.

    Rows are combined with ``pd.concat`` because ``DataFrame.append`` and
    ``Series.append`` no longer exist in pandas 2.x.

    NOTE(review): the second-to-last row is added to the training data even
    when the random split placed it in the test set - confirm this overlap is
    intended.

    Parameters
    ----------
    data_df : pandas.DataFrame
        At least two rows; last column is the regression target.
    test_size : float or int
        Forwarded to ``train_test_split``.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``train_test_split``; pass an int for a reproducible
        split. ``None`` leaves the split unseeded.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``. The appended second-to-last
        row keeps its original index label in both ``X_train`` and
        ``y_train``; the noisy rows carry index label 0.
    """
    feature_cols = data_df.columns[:-1]
    target_col = data_df.columns[-1]

    X_train, X_test, y_train, y_test = train_test_split(
        data_df[feature_cols], data_df[target_col],
        test_size=test_size, random_state=random_state)

    noisy_x_row = pd.DataFrame({col: [0.0] for col in feature_cols})
    noisy_y_row = pd.Series([0.0], name=target_col)

    # List-based .iloc keeps these as a one-row frame / one-element Series so
    # dtypes and the row label survive the concat.
    second_last_x = data_df.iloc[[-2], :-1]
    second_last_y = data_df.iloc[[-2], -1]

    X_train = pd.concat([X_train, noisy_x_row, second_last_x])
    y_train = pd.concat([y_train, noisy_y_row, second_last_y])
    X_test = pd.concat([X_test, noisy_x_row])
    y_test = pd.concat([y_test, noisy_y_row])

    return X_train, X_test, y_train, y_test

In [None]:
def linear_regressor(X_train,y_train, X_test,y_test):
    """Fit an ordinary least-squares model and report its test R^2.

    Parameters
    ----------
    X_train, y_train
        Training features and targets passed to ``LinearRegression.fit``.
    X_test, y_test
        Held-out features and targets passed to ``LinearRegression.score``.

    Returns
    -------
    str
        ``'r2 score = <value>,'`` with the score formatted by ``%f``.
    """
    model = LinearRegression()
    model.fit(X_train, y_train)
    r2 = model.score(X_test, y_test)

    return 'r2 score = %f,' % r2

In [None]:
def ridge_regressor(X_train,y_train, X_test,y_test, solver):
    """Fit a Ridge model with the given solver; return predictions and R^2 text.

    Parameters
    ----------
    X_train, y_train
        Training features and targets passed to ``Ridge.fit``.
    X_test, y_test
        Held-out features and targets used for prediction and scoring.
    solver : str
        Forwarded to ``Ridge(solver=...)``.

    Returns
    -------
    tuple
        ``(predictions, text)`` where ``predictions`` is the result of
        ``predict(X_test)`` and ``text`` is ``'r2 score = <value>,'``.
    """
    model = Ridge(solver=solver)
    model.fit(X_train, y_train)

    predictions = model.predict(X_test)
    score_text = 'r2 score = %f,' % model.score(X_test, y_test)

    return predictions, score_text

In [None]:
def step_6_select_alogorithms(X_train,X_test,y_train,y_test):
    """Fit five linear models with default settings and print each test R^2.

    The candidates are ordinary least squares, Ridge with the ``sag`` and
    ``saga`` solvers, Lasso and Elastic Net, fitted and reported in that order.

    Parameters
    ----------
    X_train, y_train
        Training features and targets.
    X_test, y_test
        Held-out features and targets used for scoring.

    Returns
    -------
    None
        One line per model is printed to stdout.
    """
    candidates = [
        ('Ordinary Least Square Regression', LinearRegression()),
        ('Ridge with sag solver', Ridge(solver='sag')),
        ('Ridge with saga solver', Ridge(solver='saga')),
        ('Lasso Regression', Lasso()),
        ('Elastic Net Regression', ElasticNet()),
    ]

    for label, model in candidates:
        model.fit(X_train, y_train)
        print('%s: r2 score = %f,' % (label, model.score(X_test, y_test)))

In [None]:
def step_6_select_ridge(X_train,X_test,y_train,y_test):
    """Compare Ridge with the ``sag`` and ``saga`` solvers on the test set.

    For each solver the model is fitted, its test R^2 is printed, and then the
    element-wise difference ``prediction - y_test.values`` is printed.

    Parameters
    ----------
    X_train, y_train
        Training features and targets.
    X_test : array-like
        Held-out features.
    y_test : pandas.Series
        Held-out targets; ``.values`` is read, so a pandas object is required.

    Returns
    -------
    None
    """
    candidates = [
        ('Ridge with sag solver', Ridge(solver='sag')),
        ('Ridge with saga solver', Ridge(solver='saga')),
    ]

    for label, model in candidates:
        model.fit(X_train, y_train)
        print('%s: r2 score = %f,' % (label, model.score(X_test, y_test)))
        residuals = np.subtract(model.predict(X_test), y_test.values)
        print(residuals)

In [None]:
def step_7_1_prepare(rows, columns):
    """Build a DataFrame from row lists and a matching list of column names.

    Parameters
    ----------
    rows : list of sequence
        Each row is indexed by position; it must provide a value for every
        entry of ``columns``.
    columns : list of str
        Column names; the value at position ``i`` of every row goes to
        ``columns[i]``.

    Returns
    -------
    pandas.DataFrame
        One column per name, one row per entry of ``rows``.
    """
    table = {
        name: [record[position] for record in rows]
        for position, name in enumerate(columns)
    }

    return pd.DataFrame(table)

In [None]:
def step_8_2_visualizaiton(regressor, X_train, y_train, X_test, y_test):
    """Produce yellowbrick's prediction-error and residuals plots for a model.

    Parameters
    ----------
    regressor
        Estimator handed to both yellowbrick helpers.
    X_train, y_train, X_test, y_test
        Train and test data, forwarded positionally to both helpers.

    Returns
    -------
    tuple
        ``(prediction_error result, residuals_plot result)``, created in that
        order.
    """
    split_data = (X_train, y_train, X_test, y_test)

    return (
        prediction_error(regressor, *split_data),
        residuals_plot(regressor, *split_data),
    )

In [None]:
def step_8_2_percentage_viz(percentage_list, columns):
    """Show a labelled pie chart of the given ratios.

    Parameters
    ----------
    percentage_list : sequence of float
        Wedge sizes; the first entry also determines the start angle.
    columns : sequence of str
        One label per wedge.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    figure = plt.figure(figsize=(9, 5.0625))
    pie_axes = figure.add_subplot(121)

    # Rotate the chart so the x-axis cuts through the middle of the first wedge.
    start_angle = -180 * percentage_list[0]

    pie_axes.pie(percentage_list, autopct='%1.1f%%', startangle=start_angle, labels=columns)
    plt.show()