In [None]:
import pandas as pd
import numpy as np

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Plot appearance for the whole notebook: ggplot style sheet, seaborn
# 'notebook' context, and 'retina' format for inline figures.
from matplotlib_inline.backend_inline import set_matplotlib_formats

plt.style.use('ggplot')
sns.set_context('notebook')
set_matplotlib_formats('retina')

In [None]:
# pd.set_option('display.max_columns', 200)  # optional: show up to 200 columns when displaying a DataFrame

In [None]:
# Load the coaster table from a CSV path relative to the working directory.
df = pd.read_csv('roller-coaster-db/coaster_db.csv')

In [None]:
# (rows, columns) of the freshly loaded table.
df.shape

In [None]:
# Preview the first five rows.
df.head()

In [None]:
# Data type pandas inferred for each column.
df.dtypes

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) as produced by describe().
df.describe()

In [None]:
# Full list of column names, used to choose which ones to keep.
df.columns

In [None]:
# Narrow the frame to the columns used in the rest of the notebook; any column
# not listed here is dropped. .copy() gives an independent frame rather than a
# selection tied to the one that was loaded.
df = df.loc[:, [
    # descriptive fields
    'coaster_name',
    'Location',
    'Status',
    'Manufacturer',
    # year, coordinates, type and opening date
    'year_introduced',
    'latitude',
    'longitude',
    'Type_Main',
    'opening_date_clean',
    # speed, height, inversions and g-force
    'speed_mph',
    'height_ft',
    'Inversions_clean',
    'Gforce_clean',
]].copy()

In [None]:
# Preview the frame after the column selection.
df.head()

In [None]:
# (rows, columns) after the column selection.
df.shape

In [None]:
# Parse the opening-date column into pandas datetimes, replacing it in place.
df['opening_date_clean'] = pd.to_datetime(df['opening_date_clean'])

In [None]:
# NOTE(review): as written this call does not rename any column. A mapping
# passed positionally to DataFrame.rename (without `columns=` or `axis=1`) is
# applied to the row index, and the keys here are column names, so presumably
# nothing matches (the index comes from read_csv defaults) and the call is a
# silent no-op. The later cells keep using the original names (e.g.
# 'coaster_name', 'speed_mph'), so switching to `columns=` would also require
# updating every one of those references.
df.rename({column: column.title() for column in df.columns}, inplace=True)

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Rows that repeat an earlier row exactly (all columns compared; the first
# occurrence of each is not flagged).
df.loc[df.duplicated()]

In [None]:
# Rows sharing name, location and opening date are treated as one coaster:
# keep the first occurrence of each and renumber the index from 0.
df = (
    df.drop_duplicates(subset=['coaster_name', 'Location', 'opening_date_clean'])
    .reset_index(drop=True)
)

In [None]:
# Count coasters per introduction year. The result is indexed by year and
# ordered by count (value_counts default), not chronologically.
year_introduced_series = df['year_introduced'].value_counts()

In [None]:
# Line chart of how many coasters were introduced in each year
# (x = year taken from the series index, y = count).
fig, ax = plt.subplots(figsize=(7, 4))
sns.lineplot(data=year_introduced_series, linewidth=2.3, alpha=0.7, ax=ax)
ax.set_title('Roller Coasters Introduced Each Year')
ax.set_xlabel('Year', fontsize=11)
ax.set_ylabel('Number of Coasters', fontsize=11)
ax.tick_params(axis='both', labelsize=10)
fig.tight_layout()
plt.show()

In [None]:
# Histogram of coaster top speeds; bar height is the number of coasters per bin.
fig, ax = plt.subplots(figsize=(7, 4))
sns.histplot(data=df['speed_mph'], ax=ax)
ax.set_title('Coaster Speed Distribution')
ax.set_xlabel('Speed (mph)', fontsize=10)
ax.set_ylabel('Frequency', fontsize=10)
ax.tick_params(axis='both', labelsize=10)
fig.tight_layout()
plt.show()

In [None]:
# Kernel density estimate of coaster top speeds. The y-axis of a KDE is a
# probability density, not a count, so it is labelled accordingly.
fig, ax = plt.subplots(figsize=(7, 4))
sns.kdeplot(data=df['speed_mph'], ax=ax)
ax.set_title('Coaster Speed Distribution')
ax.set_xlabel('Speed (mph)', fontsize=10)
ax.set_ylabel('Density', fontsize=10)
ax.tick_params(axis='both', labelsize=10)
fig.tight_layout()
plt.show()

In [None]:
# Scatter of speed against height, with points coloured by the g-force column.
fig, ax = plt.subplots(figsize=(7, 4))
sns.scatterplot(data=df, x='speed_mph', y='height_ft', hue='Gforce_clean', ax=ax)
ax.set_title('Coaster Speed vs. Coaster Height')
ax.set_xlabel('Speed (mph)', fontsize=10)
ax.set_ylabel('Height (ft)', fontsize=10)
ax.tick_params(axis='both', labelsize=8)
fig.tight_layout()
plt.show()

In [None]:
# List the current column names.
df.columns

In [None]:
# Pairwise relationships between the numeric ride attributes, coloured by type.
#
# pairplot is a figure-level function: it creates its own figure, so a
# preceding plt.figure(figsize=...) would only produce a blank extra figure and
# its size would be ignored. Panel size is set through height/aspect instead:
# 5 rows x 1.8in tall and 5 columns x 3in wide approximates a 15x9in grid.
# The grid also handles its own layout (including space for the legend), so
# plt.tight_layout() is not called afterwards.
pair_grid = sns.pairplot(
    data=df,
    vars=['year_introduced', 'Inversions_clean', 'speed_mph', 'height_ft', 'Gforce_clean'],
    hue='Type_Main',
    height=1.8,
    aspect=15 / 9,
)
plt.show()

In [None]:
# Correlation matrix of the numeric ride attributes, computed only from rows
# where all five values are present (rows with any missing value are dropped).
df_corr = (
    df.loc[:, ['year_introduced', 'Inversions_clean', 'speed_mph', 'height_ft', 'Gforce_clean']]
    .dropna()
    .corr()
)
df_corr

In [None]:
# Heatmap of the correlation matrix with each coefficient printed in its cell.
fig, ax = plt.subplots(figsize=(8, 7))
sns.heatmap(df_corr, annot=True, ax=ax)
fig.tight_layout()
plt.show()

In [None]:
# Number of rows per Location value, most frequent first.
df.Location.value_counts()

In [None]:
# Per-location speed summary: mean speed and number of non-missing speed
# values, excluding the "Other" location, keeping only locations with at least
# 10 speed values, ordered from highest to lowest mean. The result is indexed
# by Location.
fastest_costers = (
    df.query('Location != "Other"')
    .groupby('Location')['speed_mph']
    .agg(['mean', 'count'])
    .query('count >= 10')
    .sort_values('mean', ascending=False)
)

In [None]:
# Horizontal bar chart of mean coaster speed per location, coloured by how many
# speed values each location contributed.
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(
    # 'Location' is the index of fastest_costers; reset_index() exposes it as a
    # regular column so y='Location' does not depend on seaborn resolving index
    # level names.
    data=fastest_costers.reset_index(),
    x='mean',
    y='Location',
    hue='count',
    # Each location has exactly one bar, so keep bars full-width instead of
    # offsetting them per hue level.
    dodge=False,
    palette='Set2',
    linewidth=1.2,
    saturation=0.7,
    alpha=0.8,
    ax=ax,
)
ax.set_title('Mean Coaster Speed by Location (10+ Recorded Speeds)')
ax.set_xlabel('Mean Speed (mph)', fontsize=10)
ax.set_ylabel('Location', fontsize=10)
ax.tick_params(axis='both', labelsize=10)

# Restyle the legend seaborn already created rather than rebuilding it, so its
# entries are preserved and it gets a descriptive title.
legend = ax.get_legend()
if legend is not None:
    legend.set_title('Coasters with speed data')
    plt.setp(legend.get_texts(), fontsize=10)
    plt.setp(legend.get_title(), fontsize=10)

fig.tight_layout()
plt.show()