# Eurostat

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [None]:
# Load the Eurostat transport workbook (an Excel file; the path is relative to the working directory)
data = pd.read_excel('../data/eurostat/transport.xlsx')


In [None]:
# Preview the first three rows of the loaded table
data.head(3)

In [None]:
# Print the column names, dtypes and non-null counts
data.info()

In [None]:
# Count the missing values in each column
data.isnull().sum()

Since the data contains many missing values and we do not want to distort it by imputing them, we label missing entries as "unknown" instead.

In [None]:
# Label missing values as 'unknown' in the non-numeric columns only.
# Numeric columns keep their NaN on purpose: writing a string into them would turn
# them into object dtype and break the numeric aggregations (sum / mean) and the
# `notnull()` filters that later cells apply to them.
text_columns = data.select_dtypes(exclude='number').columns
data = data.fillna({column: 'unknown' for column in text_columns})

In [None]:
# Re-count the missing values per column after the fill
data.isnull().sum()

In [None]:
# Remove the 'Covid impact' column from the working table
data = data.drop(columns=['Covid impact'])


In [None]:
# Preview the first rows of the table in its current state
data.head()

In [None]:
# Total of the 'Sample size' column for each country
country_sample_size = data.groupby('Country')['Sample size'].agg('sum')
print(country_sample_size)

Comparing the totals shows that the sample sizes are not equal across countries.

In [None]:
# Draw an equally sized random subsample of rows from every country, so that each
# country contributes the same number of rows to `subsamples`.
#
# The per-country quota is the smallest *row count* of any country. The summed
# 'Sample size' column must not be used here: it is not a row count, so comparing
# it with the number of rows of a country mixes units and the sampling never applies.
min_sample_size = int(data['Country'].value_counts().min())

# A fixed seed makes the subsample reproducible on Restart & Run All.
# Sampling per group in one call also avoids growing a DataFrame with concat in a loop.
subsamples = data.groupby('Country').sample(n=min_sample_size, random_state=42)

# Now, 'subsamples' holds exactly `min_sample_size` rows for each country


In [None]:
# Show which distinct categories occur in the 'Sub type' column
sub_type_values = data['Sub type'].unique()
print(sub_type_values)


In [None]:
# Show which distinct categories occur in the 'Type' column
type_values = data['Type'].unique()
print(type_values)


In [None]:
# Spread 'Sub type' over one column per 'Type' category: each new column carries the
# 'Sub type' value on the rows of its own category and NaN on all other rows.
for category in ('fuel type', 'mode', 'purpose'):
    data[category] = data['Sub type'].where(data['Type'] == category)

# The category is now encoded by the three new columns, so 'Type' is removed.
# Note: this cell reads 'Type', so it cannot be run a second time after the drop.
data = data.drop(columns=['Type'])


In [None]:
# Inspect the first 50 rows to check the newly derived columns
data.head(50)

In [None]:
# Dropping the 'Sub type' column is currently disabled: the statement below is
# commented out, so 'Sub type' stays in `data`.
# NOTE(review): either delete this cell or re-enable the drop once 'Sub type' is no longer needed.
#data.drop('Sub type', axis=1, inplace=True)

In [None]:
# Display the column labels currently present in `data`
columns_list = data.columns
print(columns_list)


In [None]:
# Mean 'Estimator' for every (country, purpose) pair, with one column per gender
average_amounts = (
    data
    .groupby(['Country', 'purpose', 'Gender'])['Estimator']
    .mean()
    .unstack()
    .reset_index()
)

# One stacked bar chart per country: bars are purposes, segments are genders
for country_name, country_rows in average_amounts.groupby('Country'):
    country_rows.set_index('purpose').plot(kind='bar', stacked=True, figsize=(10, 6))

    plt.title(f'Average Estimator for purpose in {country_name}')
    plt.xlabel('purpose')
    plt.ylabel('Average Estimator')
    # Anchor the legend outside the axes so it does not cover the bars
    plt.legend(title='Gender', loc='upper right', bbox_to_anchor=(1.3, 1))
    plt.show()


In [None]:
import matplotlib.pyplot as plt

# Mean 'Estimator' per country (rows) and transport mode (columns).
# Group by the derived 'mode' column rather than 'Sub type': 'Sub type' also holds the
# fuel-type and purpose categories, which would otherwise appear as bogus "modes".
# NOTE(review): the mean is taken over every 'Indicator' value; confirm that 'Estimator'
# is a trip count for all of them, as the title and y-label claim.
average_trips = data.groupby(['Country', 'mode'])['Estimator'].mean().unstack()

# Stacked bar chart: one bar per country, one segment per transport mode
average_trips.plot(kind='bar', stacked=True, figsize=(14, 12))

plt.title('Average Number of Trips per Mode of Transport in Each Country')
plt.xlabel('Country')
plt.ylabel('Average Number of Trips')
plt.legend(title='Mode of Transport')
plt.show()


In [None]:
import seaborn as sns

# Long-format table: mean 'Estimator' for each (indicator, transport mode) pair
average_estimator = data.groupby(['Indicator', 'mode'], as_index=False)['Estimator'].mean()

# Grouped bar chart: one cluster per indicator, one bar per transport mode
plt.figure(figsize=(10, 6))
sns.barplot(data=average_estimator, x='Indicator', y='Estimator', hue='mode')

# Indicator names are long, so tilt the tick labels to keep them readable
plt.xticks(rotation=45)

plt.legend(title='Mode of Transport')
plt.xlabel('Indicator')
plt.ylabel('Average Estimator')
plt.title('Average Estimator for Each Indicator and Mode of Transport')

plt.show()


In [None]:
# Mean 'Estimator' for every (purpose, gender) pair, with one column per type of day
average_amounts = data.groupby(['purpose', 'Gender', 'Type of day'])['Estimator'].mean().unstack().reset_index()

# One stacked bar chart per gender: bars are purposes, segments are types of day
for gender in average_amounts['Gender'].unique():
    gender_rows = average_amounts[average_amounts['Gender'] == gender]

    gender_rows.set_index('purpose').plot(kind='bar', stacked=True, figsize=(10, 6))

    # The title names both plotted dimensions and the gender this panel is restricted to
    plt.title(f'Average Estimator by purpose and type of day ({gender})')
    plt.xlabel('Purpose')
    plt.ylabel('Average Estimator')
    # Anchor the legend outside the axes so it does not cover the bars
    plt.legend(title='Type of day', loc='upper right', bbox_to_anchor=(1.3, 1))
    plt.show()

In [None]:
# Mean 'Estimator' for every (type of day, country) pair, with one column per fuel type
average_amounts = data.groupby(['Type of day', 'Country', 'fuel type'])['Estimator'].mean().unstack().reset_index()

# One stacked bar chart per type of day: bars are countries, segments are fuel types
for day_type in average_amounts['Type of day'].unique():
    day_rows = average_amounts[average_amounts['Type of day'] == day_type]

    day_rows.set_index('Country').plot(kind='bar', stacked=True, figsize=(10, 6))

    # The title states what is plotted (fuel type), so this figure can be told apart
    # from the purpose and mode charts that share the same layout
    plt.title(f'Average Estimator by country and fuel type ({day_type})')
    plt.xlabel('Country')
    plt.ylabel('Average Estimator')
    # Anchor the legend outside the axes so it does not cover the bars
    plt.legend(title='fuel type', loc='upper right', bbox_to_anchor=(1.3, 1))
    plt.show()

In [None]:
# Mean 'Estimator' for every (type of day, country) pair, with one column per purpose
average_amounts = data.groupby(['Type of day', 'Country', 'purpose'])['Estimator'].mean().unstack().reset_index()

# One stacked bar chart per type of day: bars are countries, segments are purposes
for day_type in average_amounts['Type of day'].unique():
    day_rows = average_amounts[average_amounts['Type of day'] == day_type]

    day_rows.set_index('Country').plot(kind='bar', stacked=True, figsize=(10, 6))

    # The title states what is plotted (purpose), so this figure can be told apart
    # from the fuel-type and mode charts that share the same layout
    plt.title(f'Average Estimator by country and purpose ({day_type})')
    plt.xlabel('Country')
    plt.ylabel('Average Estimator')
    # Anchor the legend outside the axes so it does not cover the bars
    plt.legend(title='purpose', loc='upper right', bbox_to_anchor=(1.3, 1))
    plt.show()

In [None]:
# Mean 'Estimator' for every (type of day, country) pair, with one column per transport mode
average_amounts = data.groupby(['Type of day', 'Country', 'mode'])['Estimator'].mean().unstack().reset_index()

# One stacked bar chart per type of day: bars are countries, segments are transport modes
for day_type in average_amounts['Type of day'].unique():
    day_rows = average_amounts[average_amounts['Type of day'] == day_type]

    day_rows.set_index('Country').plot(kind='bar', stacked=True, figsize=(10, 6))

    # The title states what is plotted (mode), so this figure can be told apart
    # from the fuel-type and purpose charts that share the same layout
    plt.title(f'Average Estimator by country and mode ({day_type})')
    plt.xlabel('Country')
    plt.ylabel('Average Estimator')
    # Anchor the legend outside the axes so it does not cover the bars
    plt.legend(title='mode', loc='upper right', bbox_to_anchor=(1.3, 1))
    plt.show()

In [None]:
# Switch seaborn to the white-grid look (this also applies to plots drawn afterwards)
sns.set(style="whitegrid")

# Horizontal bars: one bar per purpose, bar length = mean 'Estimator', no error bars
fig, ax = plt.subplots(figsize=(14, 8))
sns.barplot(data=data, x='Estimator', y='purpose', ci=None, palette='viridis', ax=ax)
ax.set_xlabel('Average Estimator')
ax.set_ylabel('Purpose')
ax.set_title('Average Estimator for Each Purpose')
plt.show()


In [None]:
# Keep only the rows that actually carry an 'Estimator' value
filtered_data = data.dropna(subset=['Estimator'])

# Grouped bars: one cluster per country, one bar per gender, bar height = mean 'Estimator'
plt.figure(figsize=(14, 8))
sns.barplot(
    data=filtered_data,
    x='Country',
    y='Estimator',
    hue='Gender',
    ci=None,
    palette='muted',
    dodge=True,
)
plt.title('Average Estimator for Country (Countries) and Gender')
plt.xlabel('Country (Countries)')
plt.ylabel('Average Estimator')
plt.legend(title='Gender')
plt.show()


In [None]:
# Keep only the rows that actually carry an 'Estimator' value
filtered_data = data[data['Estimator'].notnull()]

# Horizontal grouped bars: one cluster per travel mode (y axis), one bar per gender,
# bar length = sum of 'Estimator' (x axis)
plt.figure(figsize=(14, 8))
sns.barplot(x='Estimator', y='mode', hue='Gender', data=filtered_data, ci=None, palette='muted', estimator=sum)
# The axis labels follow the orientation above: totals run along x, modes along y
plt.xlabel('Total Estimator')
plt.ylabel('Main Travel Mode')
plt.title('Distribution of Main Travel Modes by Gender')
plt.legend(title='Gender')
plt.show()


In [None]:
# Summary statistics restricted to the numeric columns of the dataset
numeric_summary = data.describe(include='number')
numeric_summary


In [None]:
# Number of rows for each distinct value in the 'Gender' column
gender_counts = data['Gender'].value_counts()

# Leave the Series as the last expression so the notebook displays it
gender_counts


In [None]:
# Categorical columns whose value shares are plotted, one panel per column
columns_to_analyze = ['Type of day', 'Urban or Short-distance mobility', 'Gender','Age band']

# Stack the panels vertically; the figure height grows with the number of panels
n_panels = len(columns_to_analyze)
fig, axes = plt.subplots(nrows=n_panels, ncols=1, figsize=(10, 6 * n_panels))

for ax, column in zip(axes, columns_to_analyze):
    # Share of rows (in percent) taken by each distinct value of the column
    value_totals = data[column].value_counts()
    column_percentages = value_totals.div(value_totals.sum()).mul(100)

    sns.barplot(x=column_percentages.index, y=column_percentages.values, palette='muted', ax=ax)

    # Write the percentage slightly above each bar, centred horizontally on it
    for bar, percentage in zip(ax.patches, column_percentages.values):
        label_x = bar.get_x() + bar.get_width() / 2
        label_y = bar.get_height() + 1
        ax.text(label_x, label_y, f'{percentage:.2f}%', ha='center', va='bottom')

    ax.set_xlabel(column)
    ax.set_ylabel('Percentage')
    ax.set_title(f'Percentage Distribution of {column}')

# Keep titles and tick labels of neighbouring panels from overlapping
plt.tight_layout()
plt.show()


In [None]:
# Count plot of the rows per type of day (x axis), split by age band (hue).
# The labels below describe exactly these two columns; the previous labels referred to
# "Age Class" and "Travel Purpose", neither of which matched the plotted data.
plt.figure(figsize=(14, 8))
sns.countplot(x='Type of day', hue='Age band', data=data, palette='viridis')
plt.xlabel('Type of day')
plt.ylabel('Count')
plt.title('Distribution of Age Bands by Type of Day')
# Place the legend outside the axes so it does not cover the bars
plt.legend(title='Age band', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()
