# Preparation and exploration of bicycle / bicycle-product data from tutti.ch

## Libraries and settings

In [None]:
# Libraries
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Suppress every warning for the rest of the session to keep the output clean
import warnings
warnings.filterwarnings(action='ignore')

# Print the directory that relative file paths are resolved against
print(os.getcwd())

## Import bicycle data

In [None]:
# Load the raw bicycle listings from the comma-separated file
df_orig = pd.read_csv('bicycles.csv', sep=',')

# Report the dimensions of the data frame as (rows, columns)
print(df_orig.shape)

# Display the first 10 rows with all columns
df_orig.iloc[:10, :]

## Show data types

In [None]:
# Print the data type pandas assigned to each column of the raw data frame
print(df_orig.dtypes)

## Count and remove missing values

In [None]:
# Number of missing values per column in the raw data
print('Count missing values')
missing_per_column = df_orig.isna().sum()
print(missing_per_column)

# Drop every row that contains at least one missing value
df_clean_01 = df_orig.dropna()

# Verify: every per-column count should now be zero
print('\nCheck whether missing values were successfully removed')
df_clean_01.isna().sum()

## Count and remove duplicated values

In [None]:
# Number of rows that repeat an earlier row in all columns
print('Count duplicated values')
duplicate_count = df_clean_01.duplicated().sum()
print(duplicate_count)

# Keep only one occurrence of each row
df = df_clean_01.drop_duplicates()

# Verify: the count should now be zero
print('\nCheck whether duplicated values were successfully removed')
remaining_duplicates = df.duplicated().sum()
print(remaining_duplicates)

## Create a new variable 'price' with the data type 'int'

In [None]:
# Keep only the digits of the raw price text. The pattern is a raw string so
# that '\D' reaches the regex engine without an invalid-escape warning, and
# assign() adds the column on a new frame instead of writing into the
# de-duplicated result in place.
# NOTE(review): all digits are concatenated, so a raw price with decimals
# (e.g. '12.50') would become 1250 - confirm the format of 'price_raw'.
df = df.assign(price=df['price_raw'].str.replace(r'\D', '', regex=True))

# Remove records with no price; copy so the subset can be modified safely
df_sub = df.loc[df['price'] != ''].copy()

# Change data type to int
df_sub['price'] = df_sub['price'].astype('int32')

# Check data type
print(df_sub['price'].dtype)

## Create a subset of the data frame with prices between 200.- and 2000.- CHF

In [None]:
# Flag the records whose price lies between 200 and 2000 (both inclusive)
price_in_range = (df_sub['price'] >= 200) & (df_sub['price'] <= 2000)

# Keep only the flagged records and display them
df_sub_02 = df_sub[price_in_range]
df_sub_02

## Histogram of bicycle prices

In [None]:
# Draw a 20-bin histogram of the prices in df_sub_02 on a 6 x 4 inch figure
plt.figure(figsize=(6, 4))
hist_options = {
    'grid': True,
    'bins': 20,
    'rwidth': 0.9,
    'color': 'lightseagreen',
}
ax = df_sub_02['price'].plot.hist(**hist_options)

# Title and axis labels
ax.set_title('Bicycle prices')
ax.set_xlabel('Price (CHF)')
ax.set_ylabel('Frequency')

# Thin, solid grid lines
ax.grid(linestyle='-', linewidth=0.1)

##  Save the name of the city / canton from the address string in a new variable

In [None]:
# Use the text before the first comma of the raw address as the location.
# The address is read from df_sub_02 itself (not from the unfiltered df), and
# assign() returns a new frame rather than writing a column into a filtered
# slice of another data frame.
df_sub_02 = df_sub_02.assign(
    location=df_sub_02['address_raw'].str.split(',').str[0]
)

# Show result
df_sub_02[['address_raw', 'location']]

## Create a dictionary with all prices per city / canton

In [None]:
# Build a dictionary that maps each location to the list of its prices.
# A single groupby pass replaces one full scan of the data frame per location,
# and groupby sorts the group keys, so the key order is reproducible.
bicycle_dict = {
    location: prices.tolist()
    for location, prices in df_sub_02.groupby('location')['price']
}

# Print the resulting dictionary keys and values
print('Dictionary with keys and values:')
print(bicycle_dict)

# Print the resulting dictionary keys
print('\nDictionary keys:')
print(bicycle_dict.keys())

# Print the resulting dictionary values
print('\nDictionary values')
print(bicycle_dict.values())

## Access elements of the dictionary with all prices in the city of Zürich

In [None]:
# Define the key
mykey = 'Zürich'

# Check whether mykey is a key in bicycle_dict before reading its prices,
# so a missing key is reported instead of printing a header followed by None
if mykey in bicycle_dict:
    print(f'The key {mykey} exists!')

    # Get all values from the key
    print(f'\nHere are the available prices of the key {mykey}:')
    print(bicycle_dict[mykey])
else:
    print(f'The key {mykey} does not exist!')

### Jupyter notebook --footer info-- (please always provide this at the end of each notebook)

In [None]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

# Footer: OS family, platform and release, current timestamp and Python
# version, framed by two separator lines (one item per output line)
print(
    '-----------------------------------',
    os.name.upper(),
    f'{platform.system()} | {platform.release()}',
    f"Datetime: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
    f'Python Version: {python_version()}',
    '-----------------------------------',
    sep='\n',
)