In [None]:
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.impute import SimpleImputer


In [None]:
def load_average_vehicle(file_dir):
    """Read the average-vehicle table from the workbook at ``file_dir``.

    Reads sheet "2.1, 2.2, 2.3,2.4", using spreadsheet row index 2 as the
    header, keeping 19 data rows and columns A through AH.

    Args:
        file_dir: Path (or any source accepted by ``pd.read_excel``) of the
            Excel workbook.

    Returns:
        pandas.DataFrame with the selected rows and columns.
    """
    return pd.read_excel(
        file_dir,
        sheet_name="2.1, 2.2, 2.3,2.4",
        header=2,
        nrows=19,
        usecols="A:AH",
    )

In [None]:
def load_light_fleet_age(file_dir):
    """Read the light-fleet age table from the workbook at ``file_dir``.

    Reads sheet "2.10", using spreadsheet row index 1 as the header and
    keeping 7 data rows (all columns).

    Args:
        file_dir: Path (or any source accepted by ``pd.read_excel``) of the
            Excel workbook.

    Returns:
        pandas.DataFrame with the selected rows.
    """
    return pd.read_excel(
        file_dir,
        sheet_name="2.10",
        header=1,
        nrows=7,
    )

In [None]:
def load_co2_emission(file_dir):
    """Read the CO2 emission table from the workbook at ``file_dir``.

    Reads sheet "1.10", using spreadsheet row index 2 as the header, keeping
    17 data rows and columns A through E.

    Args:
        file_dir: Path (or any source accepted by ``pd.read_excel``) of the
            Excel workbook.

    Returns:
        pandas.DataFrame with the selected rows and columns.
    """
    return pd.read_excel(
        file_dir,
        sheet_name="1.10",
        header=2,
        nrows=17,
        usecols="A:E",
    )

In [None]:
def get_sort_array_index(array):
    """Print and return the positions that sort ``array`` in ascending order.

    Element ``k`` of the result is the index in ``array`` of the k-th
    smallest value. Ties keep their original relative order (stable sort),
    and every index appears exactly once. The previous nested-loop version
    appended each matching index once per duplicate, so repeated values
    produced a result longer than the input.

    Note: NaN values are ordered last by ``np.argsort``; the earlier
    equality-based scan silently left them out.

    Args:
        array: One-dimensional sequence or array of comparable values.

    Returns:
        list[int]: The sorting indices (also printed, as before).
    """
    # argsort replaces the O(n^2) "find each sorted value" scan.
    order = np.argsort(np.asarray(array), kind='stable').tolist()

    print(order)
    return order

In [None]:
def step_2_4_1_imputation_co2_emission_df(co2_emission_df, missing_years=(2000, 2018)):
    """Append mean-imputed rows for years that are absent from the table.

    For every year in ``missing_years`` a row is created whose 'Year' is
    that year and whose other columns are NaN. A mean-strategy
    ``SimpleImputer`` fitted on ``co2_emission_df`` then fills those NaNs
    with the column means. The new rows are appended, the frame is sorted
    by 'Year' and re-indexed from 0.

    The placeholder rows are built as a DataFrame with the same columns as
    the fitted frame, so the number of value columns is no longer
    hard-coded, 'Year' does not have to be the first column, and
    scikit-learn does not see a feature-name mismatch between ``fit`` and
    ``transform``.

    Args:
        co2_emission_df: DataFrame with a 'Year' column and numeric value
            columns. It is not modified.
        missing_years: Years to add. Defaults to (2000, 2018), the values
            previously hard-coded. An empty sequence skips imputation.

    Returns:
        A new DataFrame sorted by 'Year' with a fresh 0..n-1 index. The
        imputer returns a float array, so the appended rows (including
        their 'Year' values) are floats.

    Raises:
        KeyError: If ``co2_emission_df`` has no 'Year' column.
    """
    if 'Year' not in co2_emission_df.columns:
        raise KeyError("co2_emission_df must contain a 'Year' column")

    missing_years = list(missing_years)
    frames = [co2_emission_df]

    if missing_years:
        # All-NaN rows with the same column layout as the fitted frame; only
        # 'Year' is known up front.
        placeholder_rows = pd.DataFrame(
            np.nan,
            index=range(len(missing_years)),
            columns=co2_emission_df.columns,
        )
        placeholder_rows['Year'] = missing_years

        imp = SimpleImputer(missing_values=np.nan, strategy='mean')
        imp.fit(co2_emission_df)
        imputed_values = imp.transform(placeholder_rows)

        frames.append(pd.DataFrame(imputed_values, columns=co2_emission_df.columns))

    combined = pd.concat(frames)
    combined = combined.sort_values(by=['Year'])

    return combined.reset_index(drop=True)

In [None]:
def plot_normality(target, name):
    """Draw the normal PDF parameterised by the mean and std of ``target``.

    The curve is evaluated at 1000 evenly spaced points between the minimum
    and maximum of ``target`` and shown immediately. Only the fitted curve
    is drawn; the observations themselves are not plotted.

    Args:
        target: Numeric values (array-like) to summarise.
        name: Label interpolated into the plot title.
    """
    center = np.mean(target)
    spread = np.std(target)

    grid = np.linspace(np.min(target), np.max(target), 1000)
    density = stats.norm.pdf(grid, loc=center, scale=spread)

    plt.plot(grid, density, label="PDF")
    plt.grid()
    plt.title('Check Normal Distribution for %s' % name, fontsize=10)
    plt.xlabel('x')
    plt.ylabel('Probability')
    plt.show()

In [None]:
def step_2_4_2_check_normality(number_fleets_df, light_fleet_age_df, co2_emission_df):
    """Show one ``plot_normality`` figure per series of interest.

    Plots, in order: four fleet-count columns of ``number_fleets_df``, the
    first five rows of ``light_fleet_age_df`` (positions 1..19 of each row,
    cast to int), and two emission columns of ``co2_emission_df``.

    Args:
        number_fleets_df: DataFrame holding the 'Total LPV/LCV new/used'
            columns (the "used" column names start with a space).
        light_fleet_age_df: DataFrame with one age group per row.
        co2_emission_df: DataFrame with 'Light passenger' and
            'Light commercial' columns.
    """
    # (column key, plot label) -- the "used" keys carry a leading space.
    fleet_plots = (
        ('Total LPV new', 'Total LPV new'),
        (' Total LPV used', 'Total LPV used'),
        ('Total LCV new', 'Total LCV new'),
        (' Total LCV used', 'Total LCV used'),
    )
    for column_key, label in fleet_plots:
        plot_normality(number_fleets_df[column_key], label)

    # Row position in light_fleet_age_df -> plot label.
    age_group_labels = (
        '0-4 age group',
        '5-9 age group',
        '10-14 age group',
        '15-19 age group',
        '20+ age group',
    )
    for row_position, label in enumerate(age_group_labels):
        row_values = light_fleet_age_df.iloc[row_position][1:20].astype(int)
        plot_normality(np.array(row_values), label)

    # NOTE(review): 'emssion' is misspelled in the first label; it is kept
    # verbatim here because it is the rendered plot title.
    emission_plots = (
        ('Light passenger', 'Light passenger co2 emssion'),
        ('Light commercial', 'Light commercial co2 emission'),
    )
    for column_key, label in emission_plots:
        plot_normality(co2_emission_df[column_key], label)

In [None]:
def step_3_1_clean_light_age_distribution(light_fleet_age_df):
    """Reshape the light-fleet age sheet into one row per year.

    The input is transposed and given a positional 0..n-1 index. Rows 1..19
    of the transposed frame become the count columns (column 6 dropped) and
    rows 20 onward become the percentage columns (column 5 dropped). Both
    parts are indexed by the years 2000..2018, joined side by side, and a
    leading 'Period' column holding the year is added.

    Args:
        light_fleet_age_df: Raw age-distribution DataFrame whose transposed
            form has columns labelled 0..6.

    Returns:
        DataFrame with 'Period', six count columns and six percentage
        columns, indexed 0..n-1.
    """
    years = list(range(2000, 2019))

    transposed = light_fleet_age_df.T
    transposed.index = list(range(len(transposed.index)))

    # Count block: positional rows 1..19.
    counts = pd.DataFrame(transposed[1:20]).drop(columns=[6])
    counts.columns = [
        '0-4 years',
        '5-9 years',
        '10-14 years',
        '15-19 years',
        '20+ years',
        'Total',
    ]
    counts.index = years

    # Percentage block: positional rows 20 to the end.
    shares = pd.DataFrame(transposed[20:]).drop(columns=5)
    shares.columns = [
        '0-4 years percentage',
        '5-9 years  percentage',
        '10-14 years percentage',
        '15-19 years percentage',
        '20+ years percentage',
        '15+ years percentage',
    ]
    shares.index = years

    combined = pd.concat([counts, shares], axis=1, join='inner')
    combined.insert(0, 'Period', combined.index)
    combined.index = list(range(len(combined.index)))

    return combined

In [None]:
def step_3_3_construct_new_distribution_df(nums_columns, percentage_columns, number_fleets_df, new_age_distribution_df):
    """Multiply every fleet-count column by every age-percentage column.

    For each pair the product of the two columns is stored under the name
    '<age> of <fleet>', where <age> is the percentage column name without
    its last 11 characters and <fleet> is the count column name without its
    first 6 characters, both stripped of surrounding whitespace. Columns are
    produced count-column first, percentage-column second.

    Args:
        nums_columns: Column names to take from ``number_fleets_df``.
        percentage_columns: Column names to take from
            ``new_age_distribution_df``.
        number_fleets_df: DataFrame with the fleet-count columns.
        new_age_distribution_df: DataFrame with the percentage columns.

    Returns:
        DataFrame with one column per (count, percentage) pair.
    """
    products = {
        '%s of %s' % (share_name[:-11].strip(), fleet_name[6:].strip()):
            new_age_distribution_df[share_name] * number_fleets_df[fleet_name]
        for fleet_name in nums_columns
        for share_name in percentage_columns
    }

    return pd.DataFrame(products)

In [None]:
def step_3_5_convert_object_to_int(data_df):
    """Cast every column that is not float64 to int64, in place.

    Columns whose dtype is float64 are left untouched; every other column
    is replaced by its ``astype(np.int64)`` version. The input frame itself
    is modified and also returned.

    Args:
        data_df: DataFrame whose non-float64 columns hold values that can
            be cast to int64.

    Returns:
        The same ``data_df`` object, after conversion.
    """
    for col_name in data_df.columns:
        # float64 columns are already numeric; leave them as they are.
        if data_df.dtypes[col_name] == np.float64:
            continue
        data_df[col_name] = data_df[col_name].astype(np.int64)

    return data_df