import pandas as pd
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
import numpy 
from sklearn.metrics import r2_score

class DistributionVehicles:
    
    def __init__(self):
        ''' Predicts distribution of vehicles given just All_Vehicles '''
        self.__df = pd.read_csv('missingData.csv')
        self.__testData = pd.read_csv('vehicleFixed.csv')
        self.__otherCols = self.__testData[['Date','Hour']]
        self.__testData = self.__testData['All_Vehicles2'].to_numpy().reshape(-1, 1)
        #print(self.__df.head)
        self.splitData()
    
    def splitData(self):
        ''' Splits dataset into training and test sets '''
        self.__X_train, self.__X_test, self.__y_train, self.__y_test = train_test_split(self.__df[['All_Vehicles']], self.__df[['Cars_And_Taxis','Motorbikes','Buses_and_Coaches','LGVs','HGVs']], test_size=0.2, random_state=42)
        
    def selectModel(self,model):
        ''' Select Model '''
        self.__model = model
        self.trainModel()
        self.predict()
    
    def trainModel(self):
        ''' Train Model ''' 
        self.__model.fit(self.__X_train, self.__y_train)
        
    def predict(self):
        ''' Makes predictions using the trained model '''
        predictions = self.__model.predict(self.__testData)
        predictions = pd.DataFrame(predictions, columns=['Cars_And_Taxis','Motorbikes','Buses_and_Coaches','LGVs','HGVs']).round().abs()
        #print(predictions.head)
        predictions = pd.concat([pd.DataFrame(self.__otherCols),predictions,pd.DataFrame(self.__testData)],axis=1)
        predictions = predictions.rename(columns={predictions.columns[-1]: 'All_Vehicles'})
        #predictions['Borough'] = self.__testData['Borough']
        #print(predictions)
        self.write2File(predictions)
        
    def evaluate(self):
        ''' Evaluate the performance of the selected model '''
        r2 = r2_score(self.__y_test, self.__model.predict(self.__X_test))
        print("R-squared value:", r2)
        
    def write2File(self,dataframe):
        dataframe.to_csv('vehicleFixed2.csv', index=False)
    
class ProcessData:
    
    def __init__(self):
        self.__df = pd.read_csv('baseline.csv')
        
    def formatTime(self):
        ''' Changes time from 7 -> 07:00:00 etc '''
        self.__df['Starthour'] = self.__df['Starthour'].astype(str).str.zfill(2)
        self.__df['Hour'] = self.__df['Starthour'] + ':00:00'
        # print(self.__df)
        
    def remTime(self):
        ''' Excludes all times not between 7am and 6pm '''
        # Remove starthour
        self.__df.drop('Starthour', axis=1, inplace=True)
        # Convert to datatime
        self.__df['Hour'] = pd.to_datetime(self.__df['Hour'], format='%H:%M:%S').dt.time
        self.__df.set_index(pd.to_datetime(self.__df['Hour'], format='%H:%M:%S'), inplace=True)
        # Filter time
        self.__df = self.__df.between_time('07:00:00', '18:00:00')
        self.__df.reset_index(drop=True, inplace=True)
        
    def saveDataset(self):
        ''' Saves processed dataset '''
        self.__df.to_csv('baselineProcessed.csv', index=False)

class CombineDatasets:
    
    def __init__(self):
        ''' Combines the datasets '''
        self.__df1 = pd.read_csv('baselineProcessed.csv')
        self.__df2 = pd.read_csv('missingData.csv')
        self.joinDatasets()
        self.multiDFs()

    def joinDatasets(self):
        ''' Set joins datasets '''
        # append df2 to the bottom of df1
        self.merged_df = self.__df1.append(self.__df2)
        # group by Date and Hour, and count the number of rows in each group
        counts = self.merged_df.groupby(['Date', 'Hour']).size().reset_index(name='count')
        # create a list of Date and Hour combinations that appear 3 or more times
        valid_combinations = counts[counts['count'] >= 3][['Date', 'Hour']]
        # remove rows from the original dataframe where the Date and Hour combination appears less than 3 times
        self.merged_df = self.merged_df.merge(valid_combinations, on=['Date', 'Hour'], how='inner')

        # result: dataframe with rows removed where the Date and Hour combination appears less than 3 times
        #print(self.merged_df)

    def pivotData(self,df):
        ''' Transposes data '''
        # use pivot_table to transpose the dataframe
        self.pivot_df = pd.pivot_table(df, values=['Cars_And_Taxis', 'Buses_and_Coaches', 'HGVs','LGVs','Motorbikes'], index=['Date', 'Hour'], columns=['Borough'], aggfunc='first')
        # flatten the column names to a single level
        self.pivot_df.columns = [f'{col[0]}_{col[1]}' for col in self.pivot_df.columns]
        # reset the index to create separate columns for Date and Hour
        self.pivot_df = self.pivot_df.reset_index()
        return self.pivot_df
    
    def multiDFs(self):
        ''' Seperate model is required for each model so... '''
        self.df_Croydon = self.pivotData(self.merged_df[self.merged_df['Borough'].isin(['Marlybone Road', 'Croydon'])]).dropna(axis=0)
        self.df_Ealing = self.pivotData(self.merged_df[self.merged_df['Borough'].isin(['Marlybone Road', 'Ealing'])]).dropna(axis=0)
        self.df_Greenwich = self.pivotData(self.merged_df[self.merged_df['Borough'].isin(['Marlybone Road', 'Greenwich'])]).dropna(axis=0)
        self.df_Hillingdon = self.pivotData(self.merged_df[self.merged_df['Borough'].isin(['Marlybone Road', 'Hillingdon'])]).dropna(axis=0)
        self.df_Hounslow = self.pivotData(self.merged_df[self.merged_df['Borough'].isin(['Marlybone Road', 'Hounslow'])]).dropna(axis=0)
        self.df_Lambeth = self.pivotData(self.merged_df[self.merged_df['Borough'].isin(['Marlybone Road', 'Lambeth'])]).dropna(axis=0)
        self.df_Hackney = self.pivotData(self.merged_df[self.merged_df['Borough'].isin(['Marlybone Road', 'Hackney'])]).dropna(axis=0)
        self.df_Hammersmith_and_Fulham = self.pivotData(self.merged_df[self.merged_df['Borough'].isin(['Marlybone Road', 'Hammersmith and Fulham'])]).dropna(axis=0)
        self.df_Lewisham = self.pivotData(self.merged_df[self.merged_df['Borough'].isin(['Marlybone Road', 'Lewisham'])]).dropna(axis=0)
        self.df_Sutton = self.pivotData(self.merged_df[self.merged_df['Borough'].isin(['Marlybone Road', 'Sutton'])]).dropna(axis=0)
        self.df_Bromley = self.pivotData(self.merged_df[self.merged_df['Borough'].isin(['Marlybone Road', 'Bromley'])]).dropna(axis=0)
        self.df_Merton = self.pivotData(self.merged_df[self.merged_df['Borough'].isin(['Marlybone Road', 'Merton'])]).dropna(axis=0)
        self.df_Tower_Hamlets = self.pivotData(self.merged_df[self.merged_df['Borough'].isin(['Marlybone Road', 'Tower Hamlets'])]).dropna(axis=0)
        self.df_Wandsworth = self.pivotData(self.merged_df[self.merged_df['Borough'].isin(['Marlybone Road', 'Wandsworth'])]).dropna(axis=0)
        self.df_Westminister = self.pivotData(self.merged_df[self.merged_df['Borough'].isin(['Marlybone Road', 'Westminister'])]).dropna(axis=0)
        self.df_Brent = self.pivotData(self.merged_df[self.merged_df['Borough'].isin(['Marlybone Road', 'Brent'])]).dropna(axis=0)
        self.df_Redbridge = self.pivotData(self.merged_df[self.merged_df['Borough'].isin(['Marlybone Road', 'Redbridge'])]).dropna(axis=0)
        self.df_Waltham_Forest = self.pivotData(self.merged_df[self.merged_df['Borough'].isin(['Marlybone Road', 'Waltham Forest'])]).dropna(axis=0)
        self.df_Havering = self.pivotData(self.merged_df[self.merged_df['Borough'].isin(['Marlybone Road', 'Havering'])]).dropna(axis=0)
        self.df_Kensington_and_Chelsea = self.pivotData(self.merged_df[self.merged_df['Borough'].isin(['Marlybone Road', 'Kensington and Cheslea'])]).dropna(axis=0)
        self.df_Richmond_upon_Thames = self.pivotData(self.merged_df[self.merged_df['Borough'].isin(['Marlybone Road', 'Richmond upon Thames'])]).dropna(axis=0)
        self.df_Barnet = self.pivotData(self.merged_df[self.merged_df['Borough'].isin(['Marlybone Road', 'Barnet'])]).dropna(axis=0)
        self.df_Kingston_upon_Thames = self.pivotData(self.merged_df[self.merged_df['Borough'].isin(['Marlybone Road', 'Kingston upon Thames'])]).dropna(axis=0)
        self.df_City_of_London = self.pivotData(self.merged_df[self.merged_df['Borough'].isin(['Marlybone Road', 'City of London'])]).dropna(axis=0)
        self.df_Cambden = self.pivotData(self.merged_df[self.merged_df['Borough'].isin(['Marlybone Road', 'Cambden'])]).dropna(axis=0)
        self.df_Barking_and_Dagenham = self.pivotData(self.merged_df[self.merged_df['Borough'].isin(['Marlybone Road', 'Barking and Dagenham'])]).dropna(axis=0)
        self.df_Islington = self.pivotData(self.merged_df[self.merged_df['Borough'].isin(['Marlybone Road', 'Islington'])]).dropna(axis=0)
        
        self.df_Croydon.name = 'Croydon'
        self.df_Ealing.name = 'Ealing'
        self.df_Greenwich.name = 'Greenwich'
        self.df_Hillingdon.name = 'Hillingdon'
        self.df_Hounslow.name = 'Hounslow'
        self.df_Lambeth.name = 'Lambeth'
        self.df_Hackney.name = 'Hackney'
        self.df_Hammersmith_and_Fulham.name = 'Hammersmith and Fulham'
        self.df_Lewisham.name = 'Lewisham'
        self.df_Sutton.name = 'Sutton'
        self.df_Bromley.name = 'Bromley'
        self.df_Merton.name = 'Merton'
        self.df_Tower_Hamlets.name = 'Tower Hamlets'
        self.df_Wandsworth.name = 'Wandsworth'
        self.df_Brent.name = 'Brent'
        self.df_Redbridge.name = 'Redbridge'
        self.df_Waltham_Forest.name = 'Waltham Forest'
        self.df_Havering.name = 'Havering'
        self.df_Richmond_upon_Thames.name = 'Richmond upon Thames'
        self.df_Barnet.name = 'Barnet'
        self.df_Kingston_upon_Thames.name = 'Kingston upon Thames'
        self.df_City_of_London.name = 'City of London'
        self.df_Barking_and_Dagenham.name = 'Barking and Dagenham'
        self.df_Islington.name = 'Islington'
        
        self.dfList = [
            self.df_Croydon,
            self.df_Ealing,
            self.df_Greenwich,
            self.df_Hillingdon,
            self.df_Hounslow,
            self.df_Lambeth, 
            self.df_Hackney,
            self.df_Hammersmith_and_Fulham,
            self.df_Lewisham,
            self.df_Sutton,
            self.df_Bromley,
            self.df_Merton,
            self.df_Tower_Hamlets,
            self.df_Wandsworth,
            self.df_Brent,
            self.df_Redbridge,
            self.df_Waltham_Forest,
            self.df_Havering,
            self.df_Richmond_upon_Thames,
            self.df_Barnet,
            self.df_Kingston_upon_Thames,
            self.df_City_of_London,
            self.df_Barking_and_Dagenham,
            self.df_Islington
        ]
        
    def saveDataset(self):
        ''' Saves processed dataset '''
        self.merged_df.to_csv('joinedDataset.csv', index=False)
        
    
class PredictBoroughVolume(CombineDatasets):
    
    def __init__(self):
        ''' Predicts traffic levels in each borough based on volume in our baseline dataset '''
        super().__init__()
        
        self.__predictor = pd.read_csv('baseline.csv')
        self.formatPredictor()
        self.filledBoroughs = []
        for borough in self.dfList:
            self.__df = borough
            self.boroughName = borough.name
            self.splitData()
            self.trainModel()
            self.fillMissing()
        self.combineList()
        self.saveData()
            
    def combineList(self): 
        ''' Combines the list of dataframes to a single for saving '''
        self.final = pd.concat(self.filledBoroughs)
        self.final = self.final.reindex(columns=['Date', 'Hour', 'Borough', 'Cars_And_Taxis','Buses_and_Coaches','Motorbikes','LGVs','HGVs'])
        # Remove decimals
        self.final.iloc[:, 3:8] = self.final.iloc[:, 3:8].astype(int)
        # Add all vehicles
        self.final['All_Vehicles'] = self.final.iloc[:, 3:8].sum(axis=1)
        print(self.final.head)
        
    def saveData(self):
        ''' Saves dataframe as csv files '''
        self.final.to_csv('finalVehicle.csv', index=False)

    def splitData(self):
        ''' Splits data into training and test set '''
        self.__X_train, self.__X_test, self.__y_train, self.__y_test = train_test_split(self.__df.iloc[:, [2,4,6,8,10]], self.__df.iloc[:, [3,5,7,9,11]], test_size=0.2, random_state=42)

    def trainModel(self):
        ''' Trains model '''
        self.__model = MLPRegressor(hidden_layer_sizes=(100, 50), activation='relu', solver='adam', random_state=42, max_iter=1000)
        self.__model.fit(self.__X_train, self.__y_train)
    
    def predict(self):
        ''' Used to predict and evaluate model '''
        #print(self.__X_test)
        y_pred = self.__model.predict(self.__X_test)
        r2 = r2_score(self.__y_test, y_pred)
        print(y_pred)
        #print("R-squared:", r2)
    
    def fillMissing(self):
        ''' Fill missing data for one borough '''
        columns = ['Buses_and_Coaches','Cars_And_Taxis','HGVs','LGVs','Motorbikes']
        self.df_predictions = (self.__model.predict(self.__predictorProcessed)).round()
        #print(self.df_predictions)
        self.df_predictions = pd.DataFrame(self.df_predictions, columns=columns)
        # Append Date and Starthour
        self.df_predictions = pd.concat([pd.DataFrame(self.__predictor[['Date','Starthour']]), self.df_predictions], axis=1, sort=False)
        self.df_predictions = self.formatTime(self.df_predictions)
        self.df_predictions = self.remTime(self.df_predictions)
        self.df_predictions['Date'] = pd.to_datetime(self.df_predictions['Date'])
        self.df_predictions = self.df_predictions[~(self.df_predictions['Date'].dt.year.isin([2017, 2022]))]
        #print(self.df_predictions)
        # Append Borough name
        self.df_predictions['Borough'] = self.boroughName 
        #print(self.df_predictions)
        self.filledBoroughs.append(self.df_predictions)
           
    def formatTime(self,df):
        ''' Changes time from 7 -> 07:00:00 etc '''
        df['Starthour'] = df['Starthour'].astype(str).str.zfill(2)
        df['Hour'] = df['Starthour'] + ':00:00'
        return df
        
    def remTime(self,df):
        ''' Excludes all times not between 7am and 6pm '''
        # Remove starthour
        df.drop('Starthour', axis=1, inplace=True)
        # Convert to datatime
        df['Hour'] = pd.to_datetime(df['Hour'], format='%H:%M:%S').dt.time
        df.set_index(pd.to_datetime(df['Hour'], format='%H:%M:%S'), inplace=True)
        # Filter time
        df = df.between_time('07:00:00', '18:00:00')
        df.reset_index(drop=True, inplace=True)
        return df
        
    def formatPredictor(self):
        ''' Orders predictor as Bus | Car | HGVs | LGVs | Motorbikes '''
        self.__predictorProcessed = self.__predictor[['Buses_and_Coaches','Cars_And_Taxis','HGVs','LGVs','Motorbikes']]
        

distVehicle = DistributionVehicles()
distVehicle.selectModel(LinearRegression())


'''
print('Linear Regression:')
distVehicle.evaluate()
'''
'''
p = ProcessData()
p.formatTime()
p.remTime()
p.saveDataset()
'''
#c = PredictBoroughVolume()


In [2]:
import pandas as pd
# Join vehicleFixed and vehicleFixed2 on date, hour and All_vehicle/All_vehicle2
df1 = pd.read_csv('vehicleFixed.csv')
df2 = pd.read_csv('vehicleFixed2.csv')

import pandas as pd

# If the column names are different in the two dataframes, you can specify them explicitly:
merged_df = pd.merge(df1, df2, left_on=['Hour', 'Date', 'All_Vehicles2'], right_on=['Hour', 'Date', 'All_Vehicles'])
merged_df = merged_df.drop("All_Vehicles2", axis=1)
merged_df = merged_df.drop("Unnamed: 0", axis=1)
print(merged_df.head)
merged_df.to_csv('tblVehicleFinal.csv', index=False)

<bound method NDFrame.head of                      Borough      Hour        Date        no2  Cars_And_Taxis  \
0       Barking and Dagenham  07:00:00  2018-01-01   4.374397           196.0   
1       Barking and Dagenham  08:00:00  2018-01-01   5.217581           216.0   
2       Barking and Dagenham  09:00:00  2018-01-01   8.640386           299.0   
3       Barking and Dagenham  10:00:00  2018-01-01   9.640889           323.0   
4       Barking and Dagenham  11:00:00  2018-01-01  10.846112           352.0   
...                      ...       ...         ...        ...             ...   
229645            Wandsworth  14:00:00  2021-12-31  37.946213          1001.0   
229646            Wandsworth  15:00:00  2021-12-31  40.464169          1062.0   
229647            Wandsworth  16:00:00  2021-12-31  46.568785          1208.0   
229648            Wandsworth  17:00:00  2021-12-31  36.202377           959.0   
229649            Wandsworth  18:00:00  2021-12-31  47.718706          1235.0  

In [5]:
import pandas as pd 
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor 
from sklearn.ensemble import RandomForestRegressor 
from sklearn.linear_model import LinearRegression 
from sklearn.svm import SVR 
from sklearn.ensemble import GradientBoostingRegressor 
from sklearn.neighbors import KNeighborsRegressor 
from sklearn.neural_network import MLPRegressor 
import numpy 
from sklearn.metrics import r2_score

class DistributionVehicles:

    def __init__(self):
        ''' Predicts distribution of vehicles given just All_Vehicles '''
        self.__df = pd.read_csv('missingData.csv')
        self.__testData = pd.read_csv('vehicleFixed.csv')
        self.__otherCols = self.__testData[['Date','Hour']]
        self.__testData = self.__testData['All_Vehicles2'].to_numpy().reshape(-1, 1)
        #print(self.__df.head)
        self.splitData()

    def splitData(self):
        ''' Splits dataset into training and test sets '''
        self.__X_train, self.__X_test, self.__y_train, self.__y_test = train_test_split(self.__df[['All_Vehicles']], self.__df[['Cars_And_Taxis','Motorbikes','Buses_and_Coaches','LGVs','HGVs']], test_size=0.2, random_state=42)

    def selectModel(self,model):
        ''' Select Model '''
        self.__model = model
        self.trainModel()
        self.predict()

    def trainModel(self):
        ''' Train Model ''' 
        self.__model.fit(self.__X_train, self.__y_train)

    def predict(self):
        ''' Makes predictions using the trained model '''
        predictions = self.__model.predict(self.__testData)
        predictions = pd.DataFrame(predictions, columns=['Cars_And_Taxis','Motorbikes','Buses_and_Coaches','LGVs','HGVs']).round()
        for k, v in predictions.iteritems():
            v[v < 0] = 0
        print(predictions.head)
        predictions = pd.concat([pd.DataFrame(self.__otherCols),predictions,pd.DataFrame(self.__testData)],axis=1)
        predictions = predictions.rename(columns={predictions.columns[-1]: 'All_Vehicles'})
        #predictions['Borough'] = self.__testData['Borough']
        #print(predictions)
        self.write2File(predictions)

    def evaluate(self):
        ''' Evaluate the performance of the selected model '''
        r2 = r2_score(self.__y_test, self.__model.predict(self.__X_test))
        print("R-squared value:", r2)

    def write2File(self,dataframe):
        dataframe.to_csv('vehicleFixed2.csv', index=False)
        
    class ProcessData:

        def __init__(self):
            self.__df = pd.read_csv('baseline.csv')

        def formatTime(self):
            ''' Changes time from 7 -> 07:00:00 etc '''
            self.__df['Starthour'] = self.__df['Starthour'].astype(str).str.zfill(2)
            self.__df['Hour'] = self.__df['Starthour'] + ':00:00'
            # print(self.__df)

        def remTime(self):
            ''' Excludes all times not between 7am and 6pm '''
            # Remove starthour
            self.__df.drop('Starthour', axis=1, inplace=True)
            # Convert to datatime
            self.__df['Hour'] = pd.to_datetime(self.__df['Hour'], format='%H:%M:%S').dt.time
            self.__df.set_index(pd.to_datetime(self.__df['Hour'], format='%H:%M:%S'), inplace=True)
            # Filter time
            self.__df = self.__df.between_time('07:00:00', '18:00:00')
            self.__df.reset_index(drop=True, inplace=True)

        def saveDataset(self):
            ''' Saves processed dataset '''
            self.__df.to_csv('baselineProcessed.csv', index=False)
            
class CombineDatasets:

    def __init__(self):
        ''' Combines the datasets '''
        self.__df1 = pd.read_csv('baselineProcessed.csv')
        self.__df2 = pd.read_csv('missingData.csv')
        self.joinDatasets()
        self.multiDFs()

    def joinDatasets(self):
        ''' Set joins datasets '''
        # append df2 to the bottom of df1
        self.merged_df = self.__df1.append(self.__df2)
        # group by Date and Hour, and count the number of rows in each group
        counts = self.merged_df.groupby(['Date', 'Hour']).size().reset_index(name='count')
        # create a list of Date and Hour combinations that appear 3 or more times
        valid_combinations = counts[counts['count'] >= 3][['Date', 'Hour']]
        # remove rows from the original dataframe where the Date and Hour combination appears less than 3 times
        self.merged_df = self.merged_df.merge(valid_combinations, on=['Date', 'Hour'], how='inner')

        # result: dataframe with rows removed where the Date and Hour combination appears less than 3 times
        #print(self.merged_df)

    def pivotData(self,df):
        ''' Transposes data '''
        # use pivot_table to transpose the dataframe
        self.pivot_df = pd.pivot_table(df, values=['Cars_And_Taxis', 'Buses_and_Coaches', 'HGVs','LGVs','Motorbikes'], index=['Date', 'Hour'], columns=['Borough'], aggfunc='first')
        # flatten the column names to a single level
        self.pivot_df.columns = [f'{col[0]}_{col[1]}' for col in self.pivot_df.columns]
        # reset the index to create separate columns for Date and Hour
        self.pivot_df = self.pivot_df.reset_index()
        return self.pivot_df

    def multiDFs(self):
        ''' Seperate model is required for each model so... '''
        self.df_Croydon = self.pivotData(self.merged_df[self.merged_df['Borough'].isin(['Marlybone Road', 'Croydon'])]).dropna(axis=0)
        self.df_Ealing = self.pivotData(self.merged_df[self.merged_df['Borough'].isin(['Marlybone Road', 'Ealing'])]).dropna(axis=0)
        self.df_Greenwich = self.pivotData(self.merged_df[self.merged_df['Borough'].isin(['Marlybone Road', 'Greenwich'])]).dropna(axis=0)
        self.df_Hillingdon = self.pivotData(self.merged_df[self.merged_df['Borough'].isin(['Marlybone Road', 'Hillingdon'])]).dropna(axis=0)
        self.df_Hounslow = self.pivotData(self.merged_df[self.merged_df['Borough'].isin(['Marlybone Road', 'Hounslow'])]).dropna(axis=0)
        self.df_Lambeth = self.pivotData(self.merged_df[self.merged_df['Borough'].isin(['Marlybone Road', 'Lambeth'])]).dropna(axis=0)
        self.df_Hackney = self.pivotData(self.merged_df[self.merged_df['Borough'].isin(['Marlybone Road', 'Hackney'])]).dropna(axis=0)
        self.df_Hammersmith_and_Fulham = self.pivotData(self.merged_df[self.merged_df['Borough'].isin(['Marlybone Road', 'Hammersmith and Fulham'])]).dropna(axis=0)
        self.df_Lewisham = self.pivotData(self.merged_df[self.merged_df['Borough'].isin(['Marlybone Road', 'Lewisham'])]).dropna(axis=0)
        self.df_Sutton = self.pivotData(self.merged_df[self.merged_df['Borough'].isin(['Marlybone Road', 'Sutton'])]).dropna(axis=0)
        self.df_Bromley = self.pivotData(self.merged_df[self.merged_df['Borough'].isin(['Marlybone Road', 'Bromley'])]).dropna(axis=0)
        self.df_Merton = self.pivotData(self.merged_df[self.merged_df['Borough'].isin(['Marlybone Road', 'Merton'])]).dropna(axis=0)
        self.df_Tower_Hamlets = self.pivotData(self.merged_df[self.merged_df['Borough'].isin(['Marlybone Road', 'Tower Hamlets'])]).dropna(axis=0)
        self.df_Wandsworth = self.pivotData(self.merged_df[self.merged_df['Borough'].isin(['Marlybone Road', 'Wandsworth'])]).dropna(axis=0)
        self.df_Westminister = self.pivotData(self.merged_df[self.merged_df['Borough'].isin(['Marlybone Road', 'Westminister'])]).dropna(axis=0)
        self.df_Brent = self.pivotData(self.merged_df[self.merged_df['Borough'].isin(['Marlybone Road', 'Brent'])]).dropna(axis=0)
        self.df_Redbridge = self.pivotData(self.merged_df[self.merged_df['Borough'].isin(['Marlybone Road', 'Redbridge'])]).dropna(axis=0)
        self.df_Waltham_Forest = self.pivotData(self.merged_df[self.merged_df['Borough'].isin(['Marlybone Road', 'Waltham Forest'])]).dropna(axis=0)
        self.df_Havering = self.pivotData(self.merged_df[self.merged_df['Borough'].isin(['Marlybone Road', 'Havering'])]).dropna(axis=0)
        self.df_Kensington_and_Chelsea = self.pivotData(self.merged_df[self.merged_df['Borough'].isin(['Marlybone Road', 'Kensington and Cheslea'])]).dropna(axis=0)
        self.df_Richmond_upon_Thames = self.pivotData(self.merged_df[self.merged_df['Borough'].isin(['Marlybone Road', 'Richmond upon Thames'])]).dropna(axis=0)
        self.df_Barnet = self.pivotData(self.merged_df[self.merged_df['Borough'].isin(['Marlybone Road', 'Barnet'])]).dropna(axis=0)
        self.df_Kingston_upon_Thames = self.pivotData(self.merged_df[self.merged_df['Borough'].isin(['Marlybone Road', 'Kingston upon Thames'])]).dropna(axis=0)
        self.df_City_of_London = self.pivotData(self.merged_df[self.merged_df['Borough'].isin(['Marlybone Road', 'City of London'])]).dropna(axis=0)
        self.df_Cambden = self.pivotData(self.merged_df[self.merged_df['Borough'].isin(['Marlybone Road', 'Cambden'])]).dropna(axis=0)
        self.df_Barking_and_Dagenham = self.pivotData(self.merged_df[self.merged_df['Borough'].isin(['Marlybone Road', 'Barking and Dagenham'])]).dropna(axis=0)
        self.df_Islington = self.pivotData(self.merged_df[self.merged_df['Borough'].isin(['Marlybone Road', 'Islington'])]).dropna(axis=0)

        self.df_Croydon.name = 'Croydon'
        self.df_Ealing.name = 'Ealing'
        self.df_Greenwich.name = 'Greenwich'
        self.df_Hillingdon.name = 'Hillingdon'
        self.df_Hounslow.name = 'Hounslow'
        self.df_Lambeth.name = 'Lambeth'
        self.df_Hackney.name = 'Hackney'
        self.df_Hammersmith_and_Fulham.name = 'Hammersmith and Fulham'
        self.df_Lewisham.name = 'Lewisham'
        self.df_Sutton.name = 'Sutton'
        self.df_Bromley.name = 'Bromley'
        self.df_Merton.name = 'Merton'
        self.df_Tower_Hamlets.name = 'Tower Hamlets'
        self.df_Wandsworth.name = 'Wandsworth'
        self.df_Brent.name = 'Brent'
        self.df_Redbridge.name = 'Redbridge'
        self.df_Waltham_Forest.name = 'Waltham Forest'
        self.df_Havering.name = 'Havering'
        self.df_Richmond_upon_Thames.name = 'Richmond upon Thames'
        self.df_Barnet.name = 'Barnet'
        self.df_Kingston_upon_Thames.name = 'Kingston upon Thames'
        self.df_City_of_London.name = 'City of London'
        self.df_Barking_and_Dagenham.name = 'Barking and Dagenham'
        self.df_Islington.name = 'Islington'

        self.dfList = [
            self.df_Croydon,
            self.df_Ealing,
            self.df_Greenwich,
            self.df_Hillingdon,
            self.df_Hounslow,
            self.df_Lambeth, 
            self.df_Hackney,
            self.df_Hammersmith_and_Fulham,
            self.df_Lewisham,
            self.df_Sutton,
            self.df_Bromley,
            self.df_Merton,
            self.df_Tower_Hamlets,
            self.df_Wandsworth,
            self.df_Brent,
            self.df_Redbridge,
            self.df_Waltham_Forest,
            self.df_Havering,
            self.df_Richmond_upon_Thames,
            self.df_Barnet,
            self.df_Kingston_upon_Thames,
            self.df_City_of_London,
            self.df_Barking_and_Dagenham,
            self.df_Islington
        ]

    def saveDataset(self):
        ''' Saves processed dataset '''
        self.merged_df.to_csv('joinedDataset.csv', index=False)
        
class PredictBoroughVolume(CombineDatasets):

    def __init__(self):
        ''' Predicts traffic levels in each borough based on volume in our baseline dataset '''
        super().__init__()

        self.__predictor = pd.read_csv('baseline.csv')
        self.formatPredictor()
        self.filledBoroughs = []
        for borough in self.dfList:
            self.__df = borough
            self.boroughName = borough.name
            self.splitData()
            self.trainModel()
            self.fillMissing()
        self.combineList()
        self.saveData()

    def combineList(self): 
        ''' Combines the list of dataframes to a single for saving '''
        self.final = pd.concat(self.filledBoroughs)
        self.final = self.final.reindex(columns=['Date', 'Hour', 'Borough', 'Cars_And_Taxis','Buses_and_Coaches','Motorbikes','LGVs','HGVs'])
        # Remove decimals
        self.final.iloc[:, 3:8] = self.final.iloc[:, 3:8].astype(int)
        # Add all vehicles
        self.final['All_Vehicles'] = self.final.iloc[:, 3:8].sum(axis=1)
        print(self.final.head)

    def saveData(self):
        ''' Saves dataframe as csv files '''
        self.final.to_csv('finalVehicle.csv', index=False)

    def splitData(self):
        ''' Splits data into training and test set '''
        self.__X_train, self.__X_test, self.__y_train, self.__y_test = train_test_split(self.__df.iloc[:, [2,4,6,8,10]], self.__df.iloc[:, [3,5,7,9,11]], test_size=0.2, random_state=42)

    def trainModel(self):
        ''' Trains model '''
        self.__model = MLPRegressor(hidden_layer_sizes=(100, 50), activation='relu', solver='adam', random_state=42, max_iter=1000)
        self.__model.fit(self.__X_train, self.__y_train)

    def predict(self):
        ''' Used to predict and evaluate model '''
        #print(self.__X_test)
        y_pred = self.__model.predict(self.__X_test)
        r2 = r2_score(self.__y_test, y_pred)
        print(y_pred)
        #print("R-squared:", r2)

    def fillMissing(self):
        ''' Fill missing data for one borough '''
        columns = ['Buses_and_Coaches','Cars_And_Taxis','HGVs','LGVs','Motorbikes']
        self.df_predictions = (self.__model.predict(self.__predictorProcessed)).round()
        #print(self.df_predictions)
        self.df_predictions = pd.DataFrame(self.df_predictions, columns=columns)
        # Append Date and Starthour
        self.df_predictions = pd.concat([pd.DataFrame(self.__predictor[['Date','Starthour']]), self.df_predictions], axis=1, sort=False)
        self.df_predictions = self.formatTime(self.df_predictions)
        self.df_predictions = self.remTime(self.df_predictions)
        self.df_predictions['Date'] = pd.to_datetime(self.df_predictions['Date'])
        self.df_predictions = self.df_predictions[~(self.df_predictions['Date'].dt.year.isin([2017, 2022]))]
        #print(self.df_predictions)
        # Append Borough name
        self.df_predictions['Borough'] = self.boroughName 
        #print(self.df_predictions)
        self.filledBoroughs.append(self.df_predictions)

    def formatTime(self,df):
        ''' Changes time from 7 -> 07:00:00 etc '''
        df['Starthour'] = df['Starthour'].astype(str).str.zfill(2)
        df['Hour'] = df['Starthour'] + ':00:00'
        return df

    def remTime(self,df):
        ''' Excludes all times not between 7am and 6pm '''
        # Remove starthour
        df.drop('Starthour', axis=1, inplace=True)
        # Convert to datatime
        df['Hour'] = pd.to_datetime(df['Hour'], format='%H:%M:%S').dt.time
        df.set_index(pd.to_datetime(df['Hour'], format='%H:%M:%S'), inplace=True)
        # Filter time
        df = df.between_time('07:00:00', '18:00:00')
        df.reset_index(drop=True, inplace=True)
        return df

    def formatPredictor(self):
        ''' Orders predictor as Bus | Car | HGVs | LGVs | Motorbikes '''
        self.__predictorProcessed = self.__predictor[['Buses_and_Coaches','Cars_And_Taxis','HGVs','LGVs','Motorbikes']]
        
distVehicle = DistributionVehicles() 
distVehicle.selectModel(LinearRegression())

''' print('Linear Regression:') distVehicle.evaluate() ''' ''' p = ProcessData() p.formatTime() p.remTime() p.saveDataset() '''

#c = PredictBoroughVolume()

<bound method NDFrame.head of         Cars_And_Taxis  Motorbikes  Buses_and_Coaches   LGVs  HGVs
0                196.0        14.0               18.0   33.0   0.0
1                216.0        14.0               18.0   39.0   0.0
2                299.0        17.0               19.0   62.0   3.0
3                323.0        17.0               19.0   68.0   6.0
4                352.0        18.0               19.0   76.0   9.0
...                ...         ...                ...    ...   ...
227911          1001.0        38.0               25.0  256.0  75.0
227912          1062.0        39.0               26.0  273.0  81.0
227913          1208.0        44.0               27.0  314.0  96.0
227914           959.0        36.0               25.0  245.0  71.0
227915          1235.0        45.0               27.0  321.0  99.0

[227916 rows x 5 columns]>


" print('Linear Regression:') distVehicle.evaluate()  p = ProcessData() p.formatTime() p.remTime() p.saveDataset() "