# Data types and built-in data structures using the example of car data

## Libraries and settings

In [None]:
# Libraries
import os  # working-directory lookup below
import re  # regular expressions for stripping non-digits from text columns
import pandas as pd  # data frames
import seaborn as sns  # boxplots
import matplotlib.pyplot as plt  # figures and histograms

# Set background color of graphics to dark
# plt.style.use('dark_background')

# Ignore warnings
# NOTE(review): with no category given, this filter silences every warning for
# the rest of the session, including ones raised by pandas or by Python itself
# in later cells - consider narrowing it.
import warnings
warnings.filterwarnings('ignore')

# Show current working directory
# (presumably to check where the relative CSV path used for the import is
# resolved from - verify)
print(os.getcwd())

## Import car data

In [None]:
# Load the semicolon-separated car data into a data frame
df_orig = pd.read_csv('autoscout24_data.csv', sep=";")

# Report the dimensions as (number of rows, number of columns)
dimensions = df_orig.shape
print(dimensions)

# Preview the first 10 rows, restricted to the columns at positions 1 to 11
df_orig.head(10).iloc[:, 1:12]

## Show data types

In [None]:
# List the data type pandas assigned to each column of the raw data
column_types = df_orig.dtypes
print(column_types)

## Count and remove missing values

In [None]:
# Number of missing entries in each column of the raw data
print('Count missing values')
missing_per_column = df_orig.isna().sum()
print(missing_per_column)

# Drop every row that contains at least one missing entry
df_clean_01 = df_orig.dropna()

# Re-count on the cleaned frame; every column should now report 0
print('\nCheck whether missing values were successfully removed')
df_clean_01.isna().sum()

## Count and remove duplicated values

In [None]:
# Count duplicated rows (rows identical to an earlier row in all columns)
print('Count duplicated values')
print(df_clean_01.duplicated().sum())

# Remove duplicated rows, keeping the first occurrence of each.
# .copy() makes df an independent data frame, so the new columns assigned in
# later cells are written to df itself and not to a derived copy.
df = df_clean_01.drop_duplicates().copy()

# Verify on the de-duplicated frame df (not on df_clean_01, which still
# contains the duplicates); the expected count is 0
print('\nCheck whether duplicated values were successfully removed')
print(df.duplicated().sum())

## Create additional variables using string-manipulation functions

### Extract make from car type

In [None]:
# Extract the first whitespace-separated word from each 'Type' string
df['Make'] = df['Type'].str.split().str[0]

# Explanation:
# .split() splits a string at whitespace, e.g.
# txt = 'AUDI A5 Sportback 3.0 TDI quattro S-tronic'
# txt.split()
# ['AUDI', 'A5', 'Sportback', '3.0', 'TDI', 'quattro', 'S-tronic']
# .str[0] returns the 1st item of the list which is 'AUDI'

# Show result
# (print() is required here: a notebook cell only displays its last
# expression automatically, so a bare expression at this point is discarded)
print(df[['Type', 'Make']])

# Count the number of cars per 'Make'
df['Make'].value_counts()

### Crosstab Make versus Fuel Type

In [None]:
# The ten makes that occur most often (value_counts sorts by frequency)
make_counts = df['Make'].value_counts()
top_ten = make_counts.head(10).index.tolist()

# Keep only rows of a top-ten make whose fuel type is 'Diesel' or 'Benzin'
is_top_make = df['Make'].isin(top_ten)
is_diesel_or_benzin = df['Fuel_Type'].isin(['Diesel', 'Benzin'])
df_sub = df.loc[is_top_make & is_diesel_or_benzin]

# Cross table: one row per make, one column per fuel type, cells hold counts
pd.crosstab(df_sub['Make'], df_sub['Fuel_Type'])

### Extract numerical values from 'Mileage'

In [None]:
# Extract values using the lambda function in combination with a regular expression
df['Mileage_num'] = df['Mileage'].apply(lambda x: int(re.sub(r'[^\d]', '', x)))

# Explanation:
# .apply() calls the lambda function once per element of df['Mileage'];
#   x is a single element (a string)
# The regular expression r'[^\d]' matches every character that is not a digit.
#   It is written as a raw string (r'...') so the backslash reaches the regex
#   engine unchanged; in a plain string '\d' is an invalid escape sequence
#   for which Python emits a warning.
# re.sub() replaces all matched (non-digit) characters with '', so only the
#   digits remain (if x contained several numbers, their digits are joined)
# int() converts the resulting digit string to an integer

# Show result
df[['Mileage', 'Mileage_num']]

### Histogram of Mileage

In [None]:
# Histogram of the numeric mileage on a 6 x 4 figure
plt.figure(figsize=(6, 4))
hist_options = dict(grid=True, bins=20, rwidth=0.9, color='lightseagreen')
ax = df['Mileage_num'].plot.hist(**hist_options)

# Title and axis labels
ax.set_title('Mileage')
ax.set_xlabel('Mileage (km)')
ax.set_ylabel('Frequency')

# Thin solid grid lines
ax.grid(linestyle='-', linewidth=0.1)

### Extract numerical values from Horse Power

In [None]:
# Extract values using the lambda function in combination with a regular expression.
# The pattern is a raw string (r'...') so that the backslash in \d is passed to
# the regex engine unchanged; in a plain string '\d' is an invalid escape
# sequence for which Python emits a warning.
# All non-digit characters are removed and the remaining digits are converted
# to one integer (if a value contained several numbers, their digits are joined).
df['HP_num'] = df['HP'].apply(lambda x: int(re.sub(r'[^\d]', '', x)))

# Show result
df[['HP', 'HP_num']]

### Histogram of Horsepower

In [None]:
# Histogram of the numeric horsepower on a 6 x 4 figure
plt.figure(figsize=(6, 4))
hp_values = df['HP_num']
ax = hp_values.plot.hist(
    grid=True,
    bins=20,
    rwidth=0.9,
    color='lightseagreen',
)

# Title and axis labels
ax.set_title('Horsepower')
ax.set_xlabel('HP')
ax.set_ylabel('Frequency')

# Thin solid grid lines
ax.grid(linestyle='-', linewidth=0.1)

### Extract numerical values from Price

In [None]:
# Extract values using the lambda function in combination with a regular expression.
# The pattern is a raw string (r'...') so that the backslash in \d is passed to
# the regex engine unchanged; in a plain string '\d' is an invalid escape
# sequence for which Python emits a warning.
# All non-digit characters are removed and the remaining digits are converted
# to one integer (if a value contained several numbers, their digits are joined).
df['Price_num'] = df['Price'].apply(lambda x: int(re.sub(r'[^\d]', '', x)))

# Show result
df[['Price', 'Price_num']]

### Boxplot of price

In [None]:
# Boxplot of the numeric price on a wide, flat figure
plt.figure(figsize=(7, 1.2))
ax = plt.gca()

# Plain tick labels, i.e. no scientific notation on the axes
ax.ticklabel_format(style='plain')

# Passing the values as x draws the box horizontally
sns.boxplot(x=df['Price_num'], color="lightseagreen", ax=ax)

## Create additional variables through calculations

### Calculate the Price per Horsepower

In [None]:
# Price per horsepower: element-wise ratio, rounded to one decimal place
price_hp_ratio = df['Price_num'] / df['HP_num']
df['Price_per_HP'] = price_hp_ratio.round(1)

# Show the two input columns next to the derived one
df[['Price_num', 'HP_num', 'Price_per_HP']]

### Boxplot of Price per Horsepower

In [None]:
# Boxplot of price per horsepower on a wide, flat figure
plt.figure(figsize=(7, 1.2))
ax = plt.gca()

# Plain tick labels, i.e. no scientific notation on the axes
ax.ticklabel_format(style='plain')

# Passing the values as x draws the box horizontally
sns.boxplot(x=df['Price_per_HP'], color="lightseagreen", ax=ax)

## Show data types of original and new variables

In [None]:
# Show the data type of every original and newly created column
# (note: 'object' is the dtype pandas typically uses for columns of strings)
all_column_types = df.dtypes
print(all_column_types)

## Show summary statistics of numerical variables

In [None]:
# Summary statistics of the numeric variables
summary_stats = df.describe()
summary_stats

# Explanation of the rows, in the order describe() reports them:
# count = number of (non-missing) observations
# mean  = mean
# std   = standard deviation
# min   = minimum
# 25%   = 1st quartile
# 50%   = median
# 75%   = 3rd quartile
# max   = maximum


## Use a list to store values from a calculation

In [None]:
# Define a function to calculate the value added tax for a given price
def value_added_tax(price, tax_rate=0.077):
    """Return the value added tax for ``price``.

    Args:
        price: Price the tax is calculated on.
        tax_rate: Tax rate as a fraction of the price (default 0.077).

    Returns:
        The product of ``price`` and ``tax_rate``.
    """
    tax_amount = price * tax_rate
    return tax_amount

# Apply value_added_tax (with its default tax rate) to every price and
# store the results in a list
tax_results = list(map(value_added_tax, df['Price_num']))

# Show first 10 values of the list
first_ten_taxes = tax_results[:10]
print(first_ten_taxes)

# Note that, if you have a data frame, you can also use .apply(), e.g.:
# df['Tax_results'] = df['Price_num'].apply(value_added_tax)
# print(df['Tax_results'])


## Create a dictionary with all prices per make

In [None]:
# Build a dictionary that maps each make to the list of its prices.
# set() yields every make once; the dictionary comprehension then selects
# the prices of the rows belonging to that make.
make_column = df['Make']
price_column = df['Price_num']
car_dict = {
    make: price_column[make_column == make].tolist()
    for make in set(make_column)
}

# Print the whole dictionary, then its keys, then its values
for heading, content in (
    ('Dictionary with keys and values:', car_dict),
    ('\nDictionary keys:', car_dict.keys()),
    ('\nDictionary values', car_dict.values()),
):
    print(heading)
    print(content)

## Access elements of the dictionary with all prices per make

In [None]:
# Define the key
mykey = 'FERRARI'

# Check whether the key exists in car_dict before announcing its prices;
# without this guard a missing key would be reported as having the
# "available prices" None
if mykey in car_dict:
    print(f'The key {mykey} exists!')

    # Get all values from the key
    print(f'\nHere are the available prices of the key {mykey}:')
    print(car_dict.get(mykey))
else:
    print(f'The key {mykey} does not exist in car_dict.')

## Use a set to create a list of unique makes

In [None]:
# A set keeps every make exactly once
unique_makes_set = set(df['Make'])

# Unpack the set into a list (the element order of a set is arbitrary)
unique_makes_list = [*unique_makes_set]

# Sort a copy of the list so that the unsorted list stays unchanged
unique_makes_list_sorted = unique_makes_list.copy()
unique_makes_list_sorted.sort()

# Print the sorted list of unique makes
print(unique_makes_list_sorted)

### Jupyter notebook --footer info-- (please always provide this at the end of each notebook)

In [None]:
# Footer: print basic information about the environment the notebook ran in
import os
import platform
import socket  # imported but not used by the lines below
from platform import python_version
from datetime import datetime

separator = '-----------------------------------'
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

print(separator)
# Operating system family name, e.g. POSIX or NT
print(os.name.upper())
# Operating system name and release
print(platform.system(), '|', platform.release())
# Local date and time of this run
print('Datetime:', timestamp)
# Version of the running Python interpreter
print('Python Version:', python_version())
print(separator)