In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Directory, relative to the notebook, holding both the raw input and the cleaned output CSV.
data_dir = "./data"

In [3]:
# Path of the raw merged CSV inside the data directory.
data_file = os.path.join(data_dir, "london_merged.csv")
# Read it into the raw frame; later cells copy this frame before cleaning it.
df = pd.read_csv(filepath_or_buffer=data_file)

In [4]:
# Summarise the raw frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17414 entries, 0 to 17413
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   timestamp     17414 non-null  object 
 1   cnt           17414 non-null  int64  
 2   t1            17414 non-null  float64
 3   t2            17414 non-null  float64
 4   hum           17414 non-null  float64
 5   wind_speed    17414 non-null  float64
 6   weather_code  17414 non-null  float64
 7   is_holiday    17414 non-null  float64
 8   is_weekend    17414 non-null  float64
 9   season        17414 non-null  float64
dtypes: float64(8), int64(1), object(1)
memory usage: 1.3+ MB


In [5]:
# Work on an independent (deep) copy so the raw frame `df` stays untouched by the cleaning steps.
df_clean = df.copy(deep=True)

In [6]:
# Translate the dataset's terse column names into descriptive ones.
# Columns not listed here (timestamp, wind_speed, weather_code, is_holiday,
# is_weekend) keep their original names.
column_map = {
    'season': 'season_code',
    'cnt': 'shared_counts',
    't1': 'real_temperature',
    't2': 'temperature',
    'hum': 'humidity',
}

# Rebind the renamed frame instead of using inplace=True: the result is the
# same, but the cell reads as a plain "input -> output" step and avoids
# in-place mutation across cells.
df_clean = df_clean.rename(columns=column_map)

In [7]:
# Confirm the rename took effect: same 10 columns, now with the descriptive names.
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17414 entries, 0 to 17413
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   timestamp         17414 non-null  object 
 1   shared_counts     17414 non-null  int64  
 2   real_temperature  17414 non-null  float64
 3   temperature       17414 non-null  float64
 4   humidity          17414 non-null  float64
 5   wind_speed        17414 non-null  float64
 6   weather_code      17414 non-null  float64
 7   is_holiday        17414 non-null  float64
 8   is_weekend        17414 non-null  float64
 9   season_code       17414 non-null  float64
dtypes: float64(8), int64(1), object(1)
memory usage: 1.3+ MB


In [8]:
# Look at the distinct timestamp strings to see their format before deriving the date column.
df_clean['timestamp'].unique()

array(['2015-01-04 00:00:00', '2015-01-04 01:00:00',
       '2015-01-04 02:00:00', ..., '2017-01-03 21:00:00',
       '2017-01-03 22:00:00', '2017-01-03 23:00:00'], dtype=object)

In [9]:
# The timestamp strings look like 'YYYY-MM-DD HH:MM:SS'; keep the part before the space as the date.
df_clean['date'] = df_clean['timestamp'].str.split(' ').str.get(0)

In [10]:
# Flag columns stay numeric: downcast the float values to the smallest integer dtype.
for flag_col in ('is_weekend', 'is_holiday'):
    df_clean[flag_col] = pd.to_numeric(df_clean[flag_col], downcast='integer')

# Code columns become string labels: downcast to integer first, then cast to str.
for code_col in ('weather_code', 'season_code'):
    df_clean[code_col] = pd.to_numeric(df_clean[code_col], downcast='integer').astype(str)

In [11]:
# Check the dtypes after the casts and confirm the new `date` column is present.
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17414 entries, 0 to 17413
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   timestamp         17414 non-null  object 
 1   shared_counts     17414 non-null  int64  
 2   real_temperature  17414 non-null  float64
 3   temperature       17414 non-null  float64
 4   humidity          17414 non-null  float64
 5   wind_speed        17414 non-null  float64
 6   weather_code      17414 non-null  object 
 7   is_holiday        17414 non-null  int8   
 8   is_weekend        17414 non-null  int8   
 9   season_code       17414 non-null  object 
 10  date              17414 non-null  object 
dtypes: float64(4), int64(1), int8(2), object(4)
memory usage: 1.2+ MB


In [12]:
# season_code holds strings at this point, so the lookup is keyed by strings.
season_map = {'0': 'spring', '1': 'summer', '2': 'fall', '3': 'winter'}

# Index the dict directly for every row; an unknown code raises KeyError
# rather than silently producing a missing value.
df_clean['season'] = df_clean['season_code'].map(season_map.__getitem__)

In [13]:
# Position 0 -> 'False', position 1 -> 'True': the integer flags are used as list indices.
truth_arr = ['False', 'True']

# Replace each 0/1 flag with its text label.
# NOTE: once converted the columns hold strings, so re-running this cell
# without re-running the earlier casts will fail on the list lookup.
for flag_col in ('is_holiday', 'is_weekend'):
    df_clean[flag_col] = df_clean[flag_col].map(truth_arr.__getitem__)

In [14]:
# Display the cleaned frame (pandas truncates the view to the first and last rows).
df_clean

Unnamed: 0,timestamp,shared_counts,real_temperature,temperature,humidity,wind_speed,weather_code,is_holiday,is_weekend,season_code,date,season
0,2015-01-04 00:00:00,182,3.0,2.0,93.0,6.0,3,False,True,3,2015-01-04,winter
1,2015-01-04 01:00:00,138,3.0,2.5,93.0,5.0,1,False,True,3,2015-01-04,winter
2,2015-01-04 02:00:00,134,2.5,2.5,96.5,0.0,1,False,True,3,2015-01-04,winter
3,2015-01-04 03:00:00,72,2.0,2.0,100.0,0.0,1,False,True,3,2015-01-04,winter
4,2015-01-04 04:00:00,47,2.0,0.0,93.0,6.5,1,False,True,3,2015-01-04,winter
...,...,...,...,...,...,...,...,...,...,...,...,...
17409,2017-01-03 19:00:00,1042,5.0,1.0,81.0,19.0,3,False,False,3,2017-01-03,winter
17410,2017-01-03 20:00:00,541,5.0,1.0,81.0,21.0,4,False,False,3,2017-01-03,winter
17411,2017-01-03 21:00:00,337,5.5,1.5,78.5,24.0,4,False,False,3,2017-01-03,winter
17412,2017-01-03 22:00:00,224,5.5,1.5,76.0,23.0,4,False,False,3,2017-01-03,winter


In [15]:
# Destination of the cleaned dataset, written next to the raw file in the data directory.
filepath = os.path.join(data_dir, "london.csv")

In [16]:
# Persist the cleaned frame as CSV at `filepath`.
# NOTE(review): with the default index=True the RangeIndex is written as an
# unnamed first column; pass index=False if downstream readers do not need
# it — confirm with the consumers of this file before changing.
df_clean.to_csv(path_or_buf=filepath)