# 2022 Data Cleaning and Aggregation


In [1]:
import pandas as pd
import numpy as np
import os
from geopy.geocoders import Nominatim
import matplotlib.pyplot as plt
import helpers
geolocator = Nominatim(user_agent="epra")
%load_ext dotenv
%dotenv

In [2]:
BASE_URL_RAW = os.environ.get('BASE_URL_RAW')
BASE_URL = os.environ.get('BASE_URL')

### Load files

Due to different file formats. Each file downloaded is treated as am individual data source.
A lot of manual cleaning was done using Excel


In [3]:
# Load all the data files
jan = pd.read_csv(f'{BASE_URL_RAW}/15th-January-14th-February-2022.csv')
feb = pd.read_csv(f'{BASE_URL_RAW}/15th-February-14th-March-2022_Website.csv')
march = pd.read_csv(f'{BASE_URL_RAW}/15th-March-2022-to-April-2022.csv')
april = pd.read_csv(f'{BASE_URL_RAW}/15th-April-2022-14th-May-2022.csv')
may = pd.read_csv(f'{BASE_URL_RAW}/Pump-Prices11-15-May-14-June-2022_Website.csv')
june = pd.read_csv(f'{BASE_URL_RAW}/Pump-Prices-15-June-14-July-2022_Website.csv')
july = pd.read_csv(f'{BASE_URL_RAW}/Prices-july-to-August-2022.csv')

august = pd.read_csv(f'{BASE_URL_RAW}/Prices-August-to-September-2022.csv')
sept = pd.read_csv(f'{BASE_URL_RAW}/Prices-september-to-october-2022.csv')
oct1 = pd.read_csv(f'{BASE_URL_RAW}/Prices-Oct-to-Nov-2022.csv')
nov = pd.read_csv(f'{BASE_URL_RAW}/Prices-15th-Nov-14th-Dec-2022.csv')
dec = pd.read_csv(f'{BASE_URL_RAW}/15th-December-2022-14th-January-2023.csv')


In [4]:
"""Prepare data for the first half of the year since the data is in the same format"""
def prep_data_jan_apr(data_list):
    dfs = []
    for data in data_list:
        # remove whatespaces in columns
        df = helpers.remove_column_whitespace(data)

        # clean the date
        df['Price_Period'] = helpers.sanitize_date(df, 'Price_Period')
        # Fix typos
        df = helpers.rename_towns(df)
        # Get town coordinates
        df = helpers.get_town_coordinates(df, 'Town')
        #  drop towns without coords
        df = df.dropna()
        dfs.append(df)
    combined = pd.concat(dfs)
    return combined
"""Prepare data for the first half of the year since the data is in the same format"""
def prep_data_may_july(data_list):
    dfs = []
    for data in data_list:
        # rename the coulmns using row 1
        headers = data.iloc[0]
        new_df  = pd.DataFrame(data.values[1:], columns=headers)
        new_df = new_df.reset_index(drop=True)
        new_df=new_df.dropna(axis='columns')
        
        # remove whatespaces in columns
        df = helpers.remove_column_whitespace(new_df)

        # clean the date
        df['Price_Period'] = helpers.sanitize_date(df, 'Price_Period')
        # Fix typos
        df = helpers.rename_towns(df)
        # Get town coordinates
        df = helpers.get_town_coordinates(df, 'Town')
        #  drop towns without coords
        df = df.dropna()
        dfs.append(df)
    combined = pd.concat(dfs)
    return combined
"""Prepare data for the second half of the year since the data is in the same format"""
def prep_data_jul_dec(data_list):
    dfs = []
    for data in data_list:
        period = data.columns[1]

        data.rename(columns={'Unnamed: 0': 'Price_Period', f'{period}': 'Town',
                    'MAXIMUM PUMP PRICES': 'Super', 'Unnamed: 3': 'Diesel', 'Unnamed: 4': 'Kerosene'}, inplace=True)
        print(period)
        df = data.dropna()
        # df = data

        # set the period
        df['Price_Period'] = df['Price_Period'].apply(lambda s: f'{period}')

        # clean the date
        df['Price_Period'] = helpers.sanitize_date(df, 'Price_Period')
        # # Fix town typos
        df = helpers.rename_towns(df)
        # # Get town coordinates
        df = helpers.get_town_coordinates(df, 'Town')
        # #  drop towns without coords
        df = df.dropna()
        dfs.append(df)
    combined = pd.concat(dfs)
    return combined


In [5]:
# # Prepare data for the second part of the year
first_half_2022_1 = prep_data_jan_apr([jan, feb, march, april])
first_half_2022_1


No lat long: Burnt Forest 
No lat long: Nakalale 
No lat long: Songor 
No lat long: Sambalat 
No lat long: Burnt Forest 
No lat long: Nakalale 
No lat long: Songor 
No lat long: Sambalat 
No lat long: Wundanji 
No lat long: Doldo 
No lat long: Mutuobare 
No lat long: Loyangalani 
No lat long: Burnt Forest 
No lat long: Nakalale 
No lat long: Songor 
No lat long: Sambalat 
No lat long: Asilong 
No lat long: Chepkorniswo 
No lat long: Wundanji 
No lat long: Doldo 
No lat long: Mutuobare 
No lat long: Loyangalani 
No lat long: Burnt Forest 
No lat long: Nakalale 
No lat long: Songor 
No lat long: Sambalat 
No lat long: Asilong 
No lat long: Chepkorniswo 
No lat long: Kaspokwony 
No lat long: SioPort 
No lat long: Port Bunyala 
No lat long: Kerina 


Unnamed: 0,Price_Period,Town,Super,Diesel,Kerosene,lat,lon
0,15-Jan-2022 - 14-Feb-2022,Mombasa,127.46,108.36,101.29,-4.05052,39.667169
1,15-Jan-2022 - 14-Feb-2022,Kilifi,128.17,109.07,102.01,-3.15073925,39.67507159193717
2,15-Jan-2022 - 14-Feb-2022,Likoni,127.82,108.71,101.65,-4.1027192,39.64540380366174
3,15-Jan-2022 - 14-Feb-2022,Kwale,127.82,108.71,101.65,-4.1836067,39.105094975232994
4,15-Jan-2022 - 14-Feb-2022,Malindi,128.39,109.27,102.21,-3.2165987,40.1165933
...,...,...,...,...,...,...,...
217,15-Apr-2022 - 14-May-2022,Sengera,146.25,127.42,115.38,-0.8556582,34.71132247284744
218,15-Apr-2022 - 14-May-2022,Kiambu,145.98,127.14,115.11,-1.0363950499999999,36.843131163110776
219,15-Apr-2022 - 14-May-2022,Marani,145.68,126.85,114.81,-0.5661617999999999,34.81321148475658
221,15-Apr-2022 - 14-May-2022,Ikonge,145.40,126.56,114.53,-0.5333,35.0166


In [6]:
# Prepare data for the second part of the year
first_half_2022_2 = prep_data_may_july([may,june])
first_half_2022_2

No lat long: Wundanji 
No lat long: Doldo 
No lat long: Mutuobare 
No lat long: Loyangalani 
No lat long: Burnt Forest 
No lat long: Nakalale 
No lat long: Songor 
No lat long: Sambalat 
No lat long: Asilong 
No lat long: Chepkorniswo 
No lat long: Kaspokwony 
No lat long: SioPort 
No lat long: Port Bunyala 
No lat long: Kerina 
No lat long: Wundanji 
No lat long: Doldo 
No lat long: Mutuobare 
No lat long: Loyangalani 
No lat long: Burnt Forest 
No lat long: Nakalale 
No lat long: Songor 
No lat long: Sambalat 
No lat long: Asilong 
No lat long: Chepkorniswo 
No lat long: Kaspokwony 
No lat long: SioPort 
No lat long: Port Bunyala 
No lat long: Kerina 


Unnamed: 0,Price_Period,Town,Super,Diesel,Kerosene,lat,lon
0,15-May-2022 - 14-Jun-2022,Mombasa,147.86,128.76,116.69000000000001,-4.05052,39.667169
1,15-May-2022 - 14-Jun-2022,Kilifi,148.57,129.47,117.41000000000001,-3.15073925,39.67507159193717
2,15-May-2022 - 14-Jun-2022,Likoni,148.22,129.11,117.05000000000001,-4.1027192,39.64540380366174
3,15-May-2022 - 14-Jun-2022,Kwale,148.22,129.11,117.05000000000001,-4.1836067,39.105094975232994
4,15-May-2022 - 14-Jun-2022,Malindi,148.78,129.67000000000002,117.61,-3.2165987,40.1165933
...,...,...,...,...,...,...,...
217,15-Jun-2022 - 14-Jul-2022,Sengera,160.75,141.92000000000002,129.88,-0.8556582,34.71132247284744
218,15-Jun-2022 - 14-Jul-2022,Kiambu,160.48000000000002,141.64,129.61,-1.0363950499999999,36.843131163110776
219,15-Jun-2022 - 14-Jul-2022,Marani,160.18,141.35000000000002,129.31,-0.5661617999999999,34.81321148475658
221,15-Jun-2022 - 14-Jul-2022,Ikonge,159.9,141.07,129.03,-0.5333,35.0166


In [7]:
# Prepare data for the second part of the year
second_half_2022 = prep_data_jul_dec([july,august, sept, oct1, nov, dec])
second_half_2022.head()


15th July 2022 –14th August 2022


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Price_Period'] = df['Price_Period'].apply(lambda s: f'{period}')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Price_Period'] = helpers.sanitize_date(df, 'Price_Period')


No lat long: Wundanji 
No lat long: Doldo 
No lat long: Mutuobare 
No lat long: Loyangalani 
No lat long: Burnt Forest 
No lat long: Nakalale 
No lat long: Songor 
No lat long: Sambalat 
No lat long: Asilong 
No lat long: Chepkorniswo 
No lat long: Kaspokwony 
No lat long: SioPort 
No lat long: Port Bunyala 
No lat long: Kerina 
15th August 2022 –14th September 2022


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Price_Period'] = df['Price_Period'].apply(lambda s: f'{period}')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Price_Period'] = helpers.sanitize_date(df, 'Price_Period')


No lat long: Wundanji 
No lat long: Doldo 
No lat long: Mutuobare 
No lat long: Loyangalani 
No lat long: Burnt Forest 
No lat long: Nakalale 
No lat long: Songor 
No lat long: Sambalat 
No lat long: Asilong 
No lat long: Chepkorniswo 
No lat long: Kaspokwony 
No lat long: SioPort 
No lat long: Port Bunyala 
No lat long: Kerina 
15th September 2022 –14th October 2022


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Price_Period'] = df['Price_Period'].apply(lambda s: f'{period}')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Price_Period'] = helpers.sanitize_date(df, 'Price_Period')


No lat long: Wundanji 
No lat long: Doldo 
No lat long: Mutuobare 
No lat long: Loyangalani 
No lat long: Burnt Forest 
No lat long: Nakalale 
No lat long: Songor 
No lat long: Sambalat 
No lat long: Asilong 
No lat long: Chepkorniswo 
No lat long: Kaspokwony 
No lat long: SioPort 
No lat long: Port Bunyala 
No lat long: Kerina 
15th October 2022 –14th November 2022


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Price_Period'] = df['Price_Period'].apply(lambda s: f'{period}')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Price_Period'] = helpers.sanitize_date(df, 'Price_Period')


No lat long: Wundanji 
No lat long: Doldo 
No lat long: Mutuobare 
No lat long: Loyangalani 
No lat long: Burnt Forest 
No lat long: Nakalale 
No lat long: Songor 
No lat long: Sambalat 
No lat long: Asilong 
No lat long: Chepkorniswo 
No lat long: Kaspokwony 
No lat long: SioPort 
No lat long: Port Bunyala 
No lat long: Kerina 
15th November 2022 – 14th December 2022


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Price_Period'] = df['Price_Period'].apply(lambda s: f'{period}')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Price_Period'] = helpers.sanitize_date(df, 'Price_Period')


No lat long: Wundanji 
No lat long: Doldo 
No lat long: Mutuobare 
No lat long: Loyangalani 
No lat long: Burnt Forest 
No lat long: Nakalale 
No lat long: Songor 
No lat long: Sambalat 
No lat long: Asilong 
No lat long: Chepkorniswo 
No lat long: Kaspokwony 
No lat long: SioPort 
No lat long: Port Bunyala 
No lat long: Kerina 
15th December 2022 –14th January 2023


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Price_Period'] = df['Price_Period'].apply(lambda s: f'{period}')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Price_Period'] = helpers.sanitize_date(df, 'Price_Period')


No lat long: Wundanji 
No lat long: Doldo 
No lat long: Mutuobare 
No lat long: Loyangalani 
No lat long: Burnt Forest 
No lat long: Nakalale 
No lat long: Songor 
No lat long: Sambalat 
No lat long: Asilong 
No lat long: Chepkorniswo 
No lat long: Kaspokwony 
No lat long: SioPort 
No lat long: Port Bunyala 
No lat long: Kerina 


Unnamed: 0,Price_Period,Town,Super,Diesel,Kerosene,lat,lon
2,15-Jul-2022 - 14-Aug-2022,Mombasa,156.86,137.76,125.69,-4.05052,39.667169
3,15-Jul-2022 - 14-Aug-2022,Kilifi,157.57,138.47,126.41,-3.15073925,39.67507159193717
4,15-Jul-2022 - 14-Aug-2022,Likoni,157.22,138.11,126.05,-4.1027192,39.64540380366174
5,15-Jul-2022 - 14-Aug-2022,Kwale,157.22,138.11,126.05,-4.1836067,39.105094975232994
6,15-Jul-2022 - 14-Aug-2022,Malindi,157.78,138.67,126.61,-3.2165987,40.1165933


In [8]:
# Combine the dataframes to create the 2022 dataset
year_2022 = pd.concat([first_half_2022_1,first_half_2022_2, second_half_2022])
year_2022.head()

Unnamed: 0,Price_Period,Town,Super,Diesel,Kerosene,lat,lon
0,15-Jan-2022 - 14-Feb-2022,Mombasa,127.46,108.36,101.29,-4.05052,39.667169
1,15-Jan-2022 - 14-Feb-2022,Kilifi,128.17,109.07,102.01,-3.15073925,39.67507159193717
2,15-Jan-2022 - 14-Feb-2022,Likoni,127.82,108.71,101.65,-4.1027192,39.64540380366174
3,15-Jan-2022 - 14-Feb-2022,Kwale,127.82,108.71,101.65,-4.1836067,39.105094975232994
4,15-Jan-2022 - 14-Feb-2022,Malindi,128.39,109.27,102.21,-3.2165987,40.1165933


In [9]:
# write export it as a csv
year_2022.to_csv(f'{BASE_URL}/combined.csv',index=False)
