# Statista data
Statista Consumer Insights: https://www.statista.com/global-consumer-survey 

The data comes from the Statista Global Consumer Survey (Consumer Insights). Because it is survey data, there is a potential bias: people might not answer truthfully, and a self-assessment of people's habits might not be a perfect representation of reality. Since Statista only provides the aggregated data, it is not really possible to clean the data by, e.g., filtering out nonsense answers, but it can be expected that Statista did this.

As a first step, the data was parsed into a pandas dataframe to be able to work with it in Python. This was not straightforward because the csv files are quite messy; therefore, the lines were read individually and a dataframe was generated from the raw text.

All the survey questions were parsed into the same format: the Year, the Update and the Country are used as the index. To avoid overlapping data, only Update 1 was chosen. The countries were filtered so that every dataset contains the same set of countries, since different questions were asked in different countries. The answers are stored in the columns; the rows store the percentage of people who chose the corresponding answer in a specific country. Some of the questions allowed multiple answers, which results in a total percentage of more than 100%.

Because all the data is in the same format, two methods are used for plotting it, one for bar plots and one for scatter plots. They picture the same data in different ways. The scatter plots are good for indicating trends over time, and the bar plots are useful for looking at the overall distribution of many different categories (in this case, answers to questions).

The plots for the different questions were used to gain an overview of the data, to determine possible trends and to identify the percentage split between modes of transportation. It is quite surprising that the most popular answer to the question "How often do you use the following modes of transportation? (Local public transportation)" is "Not at all". Given that there is the option "Less often than every 2-3 months", we would have expected people to choose this answer rather than "Not at all". Furthermore, it was astonishing that about 50% of people in France do not use public transport at all. It could be that the wording "local public transportation" leaves some room for interpretation, since it is not defined in more detail in the questionnaire.

The transportation usage frequency was limited to local public transportation, bicycle and car because the other modes of transport were relatively unpopular; for those, the majority of answers would be no usage at all.

In [None]:
import pandas as pd
import plotly.express as px
import glob
import numpy as np
from io import StringIO
from typing import List

from plotly.subplots import make_subplots
from plotly import graph_objects as go

In [None]:
# Folder that holds the csv exports downloaded from Statista
data_root = '../data/statista/'

# Every csv file found directly inside the data folder
file_lst = glob.glob(f'{data_root}*.csv')


In [None]:
def parse_statista_csv(filepath: str, countries: List[str] = None) -> pd.DataFrame:
    """Reads a csv file downloaded from statista.com and returns a pandas dataframe

    The file is split into sections that each start with a header line
    containing ``' - Update'`` (presumably of the form ``2019 - Update 1``).
    For every section only the "absolute" respondent counts are kept and each
    answer column is converted into a percentage of the ``Base`` column.

    Args:
        filepath (str): Filepath of the csv file
        countries (List[str], optional): Countries to keep in the result.
            Defaults to Finland, France, Italy, Spain, Germany, Switzerland,
            Sweden, Austria and Poland.

    Returns:
        pd.DataFrame: Pandas dataframe with the data, indexed by
            (Year, Update, Country). ``Base`` holds the absolute value read
            from the file, every other column the share of ``Base`` in percent.

    Raises:
        ValueError: If the file contains no update section with data.
    """
    if countries is None:
        countries = ['Finland', 'France', 'Italy', 'Spain', 'Germany', 'Switzerland', 'Sweden', 'Austria', 'Poland']
    country_lst = list(countries)

    with open(filepath, 'r') as f:
        lines = f.readlines()
    # print question:
    print(f'Question: {lines[4]}')

    update_indices = [idx for idx, line in enumerate(lines) if ' - Update' in line]
    # A section spans from one update header to the next one.
    # NOTE(review): whatever follows the last update header has no closing
    # header and is therefore never parsed - confirm that this is intended.
    pairs = zip(update_indices, update_indices[1:])
    # Filter out short pairs because they do not contain any data
    pairs = [(start, end) for start, end in pairs if end - start > 4]
    if not pairs:
        raise ValueError(f'No update section with data found in {filepath!r}')

    df_lst = []
    for update_start, update_end in pairs:
        header = lines[update_start].strip().replace('Update ', '').replace('"', '')
        year, update = (int(part) for part in header.split('-'))

        # The table starts two lines below the header and stops two lines
        # above the next header.
        csv_str = ''.join(lines[update_start + 2:update_end - 2])

        # Transpose so that every row is one country and every column one answer
        df = pd.read_csv(StringIO(csv_str), sep=';', header=None).T

        # Keep the label row (NaN in column 1) and the rows with absolute values
        df = df[(df[1] == "absolute") | (df[1].isna())]
        df = df.drop(columns=[1])

        # The first row holds the answer labels and becomes the column header
        df.loc[0, 0] = 'Country'
        df.columns = df.iloc[0]
        # Explicit copy so that the column assignments below do not operate
        # on a slice of the previous frame (avoids SettingWithCopyWarning)
        df = df.iloc[1:].copy()
        df['Year'] = year
        df['Update'] = update
        df = df.set_index(['Year', 'Update', 'Country'])

        for col in df.columns:
            # Values may look like "1,234 / 5": keep the part before " / " and
            # drop the thousands separator. float to be able to assign null
            df[col] = df[col].str.split(' / ').str[0].str.replace(',', '').astype(float)

        # Convert the absolute values into percentages of the base
        for col in df.columns:
            if col != 'Base':
                df[col] = df[col] / df['Base'] * 100
        df_lst.append(df)

    df = pd.concat(df_lst)
    df = df.query('Country in @country_lst')

    return df

In [None]:
def create_bar_plot(df: pd.DataFrame, title: str, cols_to_plot: List[str] = None, barmode: str = 'group') -> go.Figure:
    """Creates a grid of bar charts with one subplot per country

    Args:
        df (pd.DataFrame): Dataframe that provides the columns 'Year' and
            'Country' once its index is reset
        title (str): Title of the figure
        cols_to_plot (List[str], optional): Columns that are drawn as bars.
            Defaults to all columns from the fifth one onwards (after
            resetting the index).
        barmode (str, optional): Passed on to the layout's ``barmode``.
            Defaults to 'group'.

    Returns:
        go.Figure: Figure with up to four subplots per row
    """
    plot_df = df.reset_index()
    if cols_to_plot is None:
        cols_to_plot = plot_df.columns.tolist()[4:]

    countries = plot_df['Country'].unique().tolist()
    max_cols = 4
    # At least one row so that an empty dataframe yields an empty figure
    n_rows = max(1, int(np.ceil(len(countries) / max_cols)))
    fig = make_subplots(rows=n_rows, cols=max_cols, subplot_titles=countries, shared_yaxes=True, shared_xaxes=True)
    fig.update_layout(title=title, legend_title='Answer')

    color_palette = px.colors.qualitative.Dark24
    tick_label_layout = {}
    for idx, country in enumerate(countries):
        # Subplots are filled row by row
        grid_row, grid_col = divmod(idx, max_cols)
        grid_row += 1
        grid_col += 1

        country_df = plot_df[plot_df['Country'] == country]
        for i, col in enumerate(cols_to_plot):
            fig.add_trace(
                go.Bar(
                    x=country_df['Year'],
                    y=country_df[col],
                    name=col,
                    # Cycle through the palette if there are more columns than colors
                    marker=dict(color=color_palette[i % len(color_palette)]),
                    # One legend entry per answer is enough: only the first subplot adds them
                    showlegend=idx == 0,
                    legendgroup=col,
                ),
                row=grid_row, col=grid_col)
        fig.update_xaxes(title_text="Year", row=grid_row, col=grid_col)
        fig.update_yaxes(title_text="Percentage", row=grid_row, col=grid_col)

        # Shared axes hide the tick labels of inner subplots, so they are
        # switched on again for every country. Plotly names the axes
        # 'xaxis', 'xaxis2', 'xaxis3', ... - the n-th subplot uses suffix n.
        axis_suffix = '' if idx == 0 else str(idx + 1)
        tick_label_layout[f'xaxis{axis_suffix}_showticklabels'] = True
        tick_label_layout[f'yaxis{axis_suffix}_showticklabels'] = True

    fig.update_layout(legend_title='Answer', barmode=barmode, margin=dict(l=20, r=20, t=100, b=20), height=600)
    fig.update_layout(tick_label_layout)

    return fig

In [None]:
def create_scatter_plot(df: pd.DataFrame, title: str, cols_to_plot: List[str] = None) -> go.Figure:
    """Creates a grid of line/marker charts with one subplot per country

    Args:
        df (pd.DataFrame): Dataframe that provides the columns 'Year' and
            'Country' once its index is reset
        title (str): Title of the figure
        cols_to_plot (List[str], optional): Columns that are drawn as lines.
            Defaults to all columns from the fifth one onwards (after
            resetting the index).

    Returns:
        go.Figure: Figure with up to four subplots per row
    """
    plot_df = df.reset_index()
    if cols_to_plot is None:
        cols_to_plot = plot_df.columns.tolist()[4:]

    # make multiple line subplots for each country
    countries = plot_df['Country'].unique().tolist()
    max_cols = 4
    # At least one row so that an empty dataframe yields an empty figure
    n_rows = max(1, int(np.ceil(len(countries) / max_cols)))
    fig = make_subplots(rows=n_rows, cols=max_cols, subplot_titles=countries, shared_yaxes=True, shared_xaxes=True)
    fig.update_layout(title=title, legend_title='Answer')

    color_palette = px.colors.qualitative.Dark24
    tick_label_layout = {}
    for idx, country in enumerate(countries):
        # Subplots are filled row by row
        grid_row, grid_col = divmod(idx, max_cols)
        grid_row += 1
        grid_col += 1

        country_df = plot_df[plot_df['Country'] == country]
        for i, col in enumerate(cols_to_plot):
            fig.add_trace(
                go.Scatter(
                    x=country_df['Year'],
                    y=country_df[col],
                    name=col,
                    mode='markers+lines',
                    # Cycle through the palette if there are more columns than colors
                    marker=dict(color=color_palette[i % len(color_palette)]),
                    # One legend entry per answer is enough: only the first subplot adds them
                    showlegend=idx == 0,
                    legendgroup=col,
                ),
                row=grid_row, col=grid_col)
        fig.update_xaxes(title_text="Year", row=grid_row, col=grid_col)
        # The y-axis title is only shown on the first subplot of each row
        if grid_col == 1:
            fig.update_yaxes(title_text="Percentage", row=grid_row, col=grid_col)

        # Shared axes hide the tick labels of inner subplots, so they are
        # switched on again for every country. Plotly names the axes
        # 'xaxis', 'xaxis2', 'xaxis3', ... - the n-th subplot uses suffix n.
        axis_suffix = '' if idx == 0 else str(idx + 1)
        tick_label_layout[f'xaxis{axis_suffix}_showticklabels'] = True
        tick_label_layout[f'yaxis{axis_suffix}_showticklabels'] = True

    fig.update_layout(tick_label_layout)
    fig.update_layout(margin=dict(l=20, r=20, t=100, b=20), height=600)
    return fig

## Attitude towards mobility

In [None]:
# Parse the export and keep only Update 1, then print a summary of the frame
mobility_attitude_df = (
    parse_statista_csv('../data/statista/Attitudes towards mobility.csv')
    .query('Update == 1')
)
mobility_attitude_df.info()

In [None]:
# Show the parsed dataframe as the cell output
mobility_attitude_df

In [None]:
# Bar chart per country (default grouped bars) on a taller canvas
fig = create_bar_plot(
    df=mobility_attitude_df,
    title='Attitudes towards mobility',
)
fig.update_layout(height=900)
fig.show()

In [None]:
# Development over the years per country on a taller canvas
fig = create_scatter_plot(
    df=mobility_attitude_df,
    title='Attitudes towards mobility',
)
fig.update_layout(height=900)
fig.show()

In [None]:
# commute_countries = commute_duration_df.reset_index()['Country'].unique().tolist()
# frequent_modes_of_transport_countries = frequent_modes_of_transport_df.reset_index()['Country'].unique().tolist()
# commute_mode_countries = commute_modes_of_transport_df.reset_index()['Country'].unique().tolist()
# transportation_usage_public_countries = frequency_public_transport_df.reset_index()['Country'].unique().tolist()
# transportation_usage_cars_countries = frequency_car_df.reset_index()['Country'].unique().tolist()
# transportation_usage_combined_countries = frequency_combined_df.reset_index()['Country'].unique().tolist()
# # find common elements
# common_countries = list(set(commute_countries) & set(frequent_modes_of_transport_countries) & set(commute_mode_countries) & set(transportation_usage_public_countries) & set(transportation_usage_cars_countries) & set(transportation_usage_combined_countries))
# common_countries

## Duration of daily commute

In [None]:
# Parse the export and keep only Update 1, then print a summary of the frame
commute_duration_df = (
    parse_statista_csv('../data/statista/Duration of daily commute.csv')
    .query('Update == 1')
)
commute_duration_df.info()

In [None]:
# Show the parsed dataframe as the cell output
commute_duration_df

In [None]:
# Stacked bars per country on a taller canvas
fig = create_bar_plot(
    df=commute_duration_df,
    title='Duration of daily commute',
    barmode='stack',
)
fig.update_layout(height=1000)
fig.show()

In [None]:
# Grouped bars per country on a taller canvas
fig = create_bar_plot(
    df=commute_duration_df,
    title='Duration of daily commute',
    barmode='group',
)
fig.update_layout(height=1000)
fig.show()

In [None]:
# Development over the years per country on a taller canvas
fig = create_scatter_plot(
    df=commute_duration_df,
    title='Duration of daily commute',
)
fig.update_layout(height=1000)
fig.show()


## Frequent users of modes of transportation

In [None]:
# Parse the export and keep only Update 1, then print a summary of the frame
frequent_modes_of_transport_df = (
    parse_statista_csv('../data/statista/Frequent users of modes of transportation.csv')
    .query('Update == 1')
)
frequent_modes_of_transport_df.info()

In [None]:
# Grouped bars per country; the returned figure is the value of the cell
create_bar_plot(
    df=frequent_modes_of_transport_df,
    title='Frequent users of modes of transportation',
    barmode='group',
)

In [None]:
# Stacked bars per country on a taller canvas
fig = create_bar_plot(
    df=frequent_modes_of_transport_df,
    title='Frequent users of modes of transportation',
    barmode='stack',
)
fig.update_layout(height=1000)
fig.show()

In [None]:
# Development over the years per country, using the default figure height
fig = create_scatter_plot(
    df=frequent_modes_of_transport_df,
    title='Frequent users of modes of transportation',
)
fig.show()

## Modes of transportation for commuting

In [None]:
# Parse the export and keep only Update 1, then print a summary of the frame
commute_modes_of_transport_df = (
    parse_statista_csv('../data/statista/Modes of transportation for commuting.csv')
    .query('Update == 1')
)
commute_modes_of_transport_df.info()

In [None]:
# Grouped bars per country; the returned figure is the value of the cell
create_bar_plot(
    df=commute_modes_of_transport_df,
    title='Modes of transportation for commuting',
    barmode='group',
)

In [None]:
# Stacked bars per country on a taller canvas
fig = create_bar_plot(
    df=commute_modes_of_transport_df,
    title='Modes of transportation for commuting',
    barmode='stack',
)
fig.update_layout(height=1000)
fig.show()

In [None]:
# Development over the years per country; the returned figure is the value of the cell
create_scatter_plot(
    df=commute_modes_of_transport_df,
    title='Modes of transportation for commuting',
)

## Bus user (local)

In [None]:
# Parse the export and keep only Update 1, then print a summary of the frame
bus_user_df = (
    parse_statista_csv('../data/statista/Bus user (local).csv')
    .query('Update == 1')
)
bus_user_df.info()

In [None]:
# fig = create_bar_plot(bus_user_df, 'Bus user (local)', barmode='group')
# fig.update_layout(height=900)
# fig.show()

In [None]:
# fig = create_bar_plot(bus_user_df, 'Bus user (local)', barmode='stack')
# fig.update_layout(height=900)
# fig.show()

In [None]:
# Development over the years per country on a taller canvas
fig = create_scatter_plot(
    df=bus_user_df,
    title='Bus user (local)',
)
fig.update_layout(height=900)
fig.show()

## Train user (local)

In [None]:
# Parse the export and keep only Update 1, then print a summary of the frame
train_user_df = (
    parse_statista_csv('../data/statista/Train user (local).csv')
    .query('Update == 1')
)
train_user_df.info()

In [None]:
# fig = create_bar_plot(train_user_df, 'Train user (local)', barmode='group')
# fig.update_layout(height=900)
# fig.show()

In [None]:
# fig = create_bar_plot(train_user_df, 'Train user (local)', barmode='stack')
# fig.update_layout(height=900)
# fig.show()

In [None]:
# Development over the years per country on a taller canvas
fig = create_scatter_plot(
    df=train_user_df,
    title='Train user (local)',
)
fig.update_layout(height=900)
fig.show()

## Transportation usage frequency Local public transportation

In [None]:
# Parse the export and keep only Update 1, then print a summary of the frame
frequency_public_transport_df = (
    parse_statista_csv('../data/statista/Transportation usage frequency Local public transportation.csv')
    .query('Update == 1')
)
frequency_public_transport_df.info()

In [None]:
# Stacked bars per country; the returned figure is the value of the cell.
# The title carries the same "Transportation usage frequency: ..." form as
# the other plots of this dataset.
create_bar_plot(
    frequency_public_transport_df,
    'Transportation usage frequency: Local public transportation',
    barmode='stack',
)

In [None]:
# Grouped bars per country; the returned figure is the value of the cell
create_bar_plot(
    df=frequency_public_transport_df,
    title='Transportation usage frequency: Local public transportation',
    barmode='group',
)

In [None]:
# Development over the years per country on a taller canvas
fig = create_scatter_plot(
    df=frequency_public_transport_df,
    title='Transportation usage frequency: Local public transportation',
)
fig.update_layout(height=1000)
fig.show()

## Transportation usage frequency Own bicycle

In [None]:
# Parse the export and keep only Update 1, then print a summary of the frame
frequency_bicycle_df = (
    parse_statista_csv('../data/statista/Transportation usage frequency Own bicycle.csv')
    .query('Update == 1')
)
frequency_bicycle_df.info()

In [None]:
# Stacked bars per country; the returned figure is the value of the cell
create_bar_plot(
    df=frequency_bicycle_df,
    title='Transportation usage frequency: Own bicycle',
    barmode='stack',
)

In [None]:
# Grouped bars per country; the returned figure is the value of the cell
create_bar_plot(
    df=frequency_bicycle_df,
    title='Transportation usage frequency: Own bicycle',
    barmode='group',
)

In [None]:
# Development over the years per country on a slightly taller canvas
fig = create_scatter_plot(
    df=frequency_bicycle_df,
    title='Transportation usage frequency: Own bicycle',
)
fig.update_layout(height=700)
fig.show()

## Transportation usage frequency Own or household car

In [None]:
# Parse the export and keep only Update 1, then print a summary of the frame
frequency_car_df = (
    parse_statista_csv('../data/statista/Transportation usage frequency Own or household car.csv')
    .query('Update == 1')
)
frequency_car_df.info()


In [None]:
# Stacked bars per country; the returned figure is the value of the cell
create_bar_plot(
    df=frequency_car_df,
    title='Transportation usage frequency: Own or household car',
    barmode='stack',
)

In [None]:
# Grouped bars per country; the returned figure is the value of the cell
create_bar_plot(
    df=frequency_car_df,
    title='Transportation usage frequency: Own or household car',
    barmode='group',
)

In [None]:
# Development over the years per country on a taller canvas
fig = create_scatter_plot(
    df=frequency_car_df,
    title='Transportation usage frequency: Own or household car',
)
fig.update_layout(height=1000)
fig.show()

## Compare frequency - public transport, bicycle and car

This is done by joining the required dataframes and plotting them. The columns to be plotted were limited to a subset to make the plots less cluttered.

In [None]:
# Start with the public transport answers; the "Base" column is dropped and
# every answer column is prefixed with its mode of transport.
frequency_combined_df = frequency_public_transport_df.drop(columns="Base").add_prefix("Public Transport ")
# Outer joins keep index entries that are missing in one of the frames
frequency_combined_df = frequency_combined_df.join(
    frequency_bicycle_df.drop(columns="Base").add_prefix("Bicycle "),
    how="outer",
)
frequency_combined_df = frequency_combined_df.join(
    frequency_car_df.drop(columns="Base").add_prefix("Car "),
    how="outer",
)
frequency_combined_df

In [None]:

# Only the columns containing 'daily' are plotted to keep the chart readable.
# To compare more answers, extend the filter with e.g. 'Not at all' or
# '2-5 times per week'.
fig = create_scatter_plot(
    df=frequency_combined_df,
    title='Transportation usage frequency',
    cols_to_plot=[name for name in frequency_combined_df.columns.to_list() if 'daily' in name],
)
# Last expression of the cell; fig.show(renderer='browser') would open the
# figure in the browser instead.
fig.update_layout(height=1000)

In [None]:
# Stacked bars for a subset of the answers; the returned figure is the value of the cell
create_bar_plot(
    df=frequency_combined_df,
    title='Transportation usage frequency',
    cols_to_plot=[
        name
        for name in frequency_combined_df.columns.to_list()
        if any(key in name for key in ('daily', 'Not at all', '2-5 times per week'))
    ],
    barmode='stack',
)


In [None]:
# Grouped bars for the same subset of answers
fig = create_bar_plot(
    df=frequency_combined_df,
    title='Transportation usage frequency',
    cols_to_plot=[
        name
        for name in frequency_combined_df.columns.to_list()
        if any(key in name for key in ('daily', 'Not at all', '2-5 times per week'))
    ],
    barmode='group',
)
fig.show()