In [None]:
# https://www.statology.org/pandas-groupby-plot/ 
# https://www.tutorialspoint.com/matplotlib/index.htm
# https://www.geeksforgeeks.org/python-seaborn-tutorial/?ref=lbp
# https://www.w3schools.com/python/matplotlib_intro.asp

In [None]:
# A sample of Project1

In [None]:
# importing the required library
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
# Data Preparation

In [None]:
# Load the film-permit CSV; the file must sit in the same directory as this notebook.
permits_csv = "Film_Permits.csv"
df = pd.read_csv(permits_csv)
df.head()

In [None]:
# List every column label of the loaded frame before choosing which ones to keep.
df.columns

In [None]:
# Keep only the columns needed for the analysis and shape them to fit the data requirements

In [None]:
# Use the EventID values as the row index; drop=False keeps EventID as a column too.
df = df.set_index("EventID", drop=False)

In [None]:
# Keep only the columns used in the analysis.
# .copy() makes the result an independent frame, so adding or overwriting columns
# in later cells does not raise SettingWithCopyWarning.
df = df[['EventType', 'StartDateTime', 'EndDateTime', 'EnteredOn', 'Borough', 'Category', 'SubCategoryName']].copy()
# Show a preview rather than the whole frame.
df.head()

In [None]:
# Convert the three date columns from object dtype to datetime so that they can be
# subtracted from each other in the following cells.
# Reference: https://www.geeksforgeeks.org/python-pandas-to_datetime/
df = df.assign(
    StartDateTime=pd.to_datetime(df["StartDateTime"]),
    EndDateTime=pd.to_datetime(df["EndDateTime"]),
    EnteredOn=pd.to_datetime(df["EnteredOn"]),
)
df.head()

In [None]:
# The dataset has no numeric columns, so derive two time spans to analyse:
#   EventDuration       - time from StartDateTime to EndDateTime
#   ApplicationTimeDiff - time from EnteredOn (when the application was filed) to StartDateTime
df = df.assign(
    EventDuration=lambda frame: frame["EndDateTime"] - frame["StartDateTime"],
    ApplicationTimeDiff=lambda frame: frame["StartDateTime"] - frame["EnteredOn"],
)
df.head()

In [None]:
# The previous cell already previews the frame, so show the column dtypes instead
# to check the result of the datetime conversion and the two derived columns.
df.dtypes

In [None]:
# Distinct values that occur in the EventType column
pd.unique(df["EventType"])

In [None]:
# Number of rows for each (Category, SubCategoryName) combination
df.value_counts(subset=["Category", "SubCategoryName"])

In [None]:
# Distinct values that occur in the SubCategoryName column
df.loc[:, "SubCategoryName"].unique()

In [None]:
# Descriptive statistics: you can compute count and mean separately, or call describe() on the columns

In [None]:
# Summary statistics for the two derived time-span columns
timedelta_cols = ["EventDuration", "ApplicationTimeDiff"]
df[timedelta_cols].describe()

In [None]:
# Number of rows per Borough
by_borough = df.groupby("Borough")
by_borough["Borough"].count()

In [None]:
# Mean EventDuration and mean ApplicationTimeDiff per Borough
span_cols = ["EventDuration", "ApplicationTimeDiff"]
df.groupby("Borough")[span_cols].mean()

In [None]:
# Pandas has built-in plotting that uses the matplotlib library, so you can call the pandas
# plot commands or use matplotlib directly

In [None]:
# Share of applications per Borough, drawn with pandas' built-in plotting.
# The autopct format string prints each wedge's percentage inside that wedge.
brough_count = df.groupby("Borough")["Borough"].count()
brough_count.plot(kind="pie", autopct='%.2f%%', title="MOME Applications by Borough")

In [None]:
# Count the non-missing EventDuration values for every (Borough, EventType) pair.
# After reset_index() the "EventDuration" column holds that count, not a duration.
permits_per_group = df.groupby(["Borough", "EventType"])["EventDuration"].count()
borough_eventype_grp = permits_per_group.reset_index()
borough_eventype_grp

In [None]:
# Bar chart of the number of applications per Borough, split by EventType.
# The plotted column is named "EventDuration" but holds a per-group count, so the
# y-axis is relabelled to say what the bars actually show.
ax = sns.barplot(x="Borough", y="EventDuration", hue="EventType", data=borough_eventype_grp)
ax.set(ylabel="Number of applications", title="Applications by Borough and Event Type")
plt.show()
# This chart does not lead to any clear conclusion,
# so we will skip it

In [None]:
# Converting a timedelta column to plain numbers.
# Casting with astype(int) returns the raw integer ticks, whose unit depends on the
# column's timedelta resolution (nanoseconds for timedelta64[ns]), so the unit is easy
# to get wrong. .dt.total_seconds() always returns seconds as floats.
df.groupby(["Borough", "Category"])["EventDuration"].mean().dt.total_seconds()

In [None]:
# Let's try with Category and Borough.
# Bar chart of the number of applications per Category, split by Borough.
# size() counts rows per group; the count is stored under its own column name so the
# chart is not labelled as a duration.
borough_category_grp = (
    df.groupby(["Borough", "Category"])
    .size()
    .reset_index(name="Applications")
)
borough_category_grp_chart = sns.barplot(x="Category", y="Applications", hue="Borough",
                                         data=borough_category_grp)
# Rotate the category names so the long labels do not overlap
borough_category_grp_chart.set_xticklabels(borough_category_grp_chart.get_xticklabels(),
                                           rotation=75, horizontalalignment='right')
borough_category_grp_chart.set(ylabel="Number of applications",
                               title="Applications by Category and Borough")
plt.show()


In [None]:
# Same chart with different command

In [None]:
# countplot counts the rows itself, so no groupby is needed; it returns a matplotlib Axes.
ax = sns.countplot(data=df, x="Category", hue="Borough")
# Rotate the category names so the long labels do not overlap
ax.set_xticklabels(ax.get_xticklabels(), rotation=75, horizontalalignment='right')
ax.set_title("Application over Category by Borough")
plt.show()

In [None]:
# Let's try with Category and SubCategoryName.
# Bar chart of the mean event duration per Category, split by SubCategoryName.
# The mean timedelta is converted to hours with .dt.total_seconds(), which gives a
# readable, well-defined unit instead of raw integer ticks.
category_sub_grp = (
    df.groupby(["Category", "SubCategoryName"])["EventDuration"]
    .mean()
    .dt.total_seconds()
    .div(3600)
    .reset_index()
)
category_sub_grp_chart = sns.barplot(x="Category", y="EventDuration", hue="SubCategoryName",
                                     data=category_sub_grp)
# Rotate the category names so the long labels do not overlap
category_sub_grp_chart.set_xticklabels(category_sub_grp_chart.get_xticklabels(),
                                       rotation=75, horizontalalignment='right')
category_sub_grp_chart.set(ylabel="Mean event duration (hours)",
                           title="Mean Event Duration by Category and SubCategory")
plt.show()
# This is a bad chart that doesn't help us, so we are going to skip it

In [None]:
# Pie chart of applications per Category, this time built with matplotlib directly
# instead of the pandas .plot accessor, to see which category has the most applications.
cnt_category = df.groupby("Category").size()
fig, ax = plt.subplots()
ax.pie(cnt_category, labels=cnt_category.index, autopct='%.2f%%')
ax.set_title("MOME Applications by Categories")
plt.show()

In [None]:
# Mean EventDuration and ApplicationTimeDiff per Category, sorted ascending by
# EventDuration first and ApplicationTimeDiff second.
sort_keys = ["EventDuration", "ApplicationTimeDiff"]
category_means = df.groupby("Category")[sort_keys].mean()
category_means.sort_values(by=sort_keys)

In [None]:
# Checking the distribution of the event durations with a histogram.
# EventDuration is a timedelta, so it is converted to hours and rounded to 4 decimal
# places. .dt.total_seconds() is used because it does not depend on the timedelta's
# internal integer unit.
# Rows with a missing duration are dropped so the histogram only receives real numbers.
event_hours = (df["EventDuration"].dt.total_seconds() / 3600).round(4).dropna()
plt.hist(event_hours)
plt.xlabel("Event duration (hours)")
plt.ylabel("Number of applications")
plt.title("Event Duration Histogram (unfiltered)")
plt.show()

In [None]:
# Summary statistics of EventDuration, to judge whether extreme durations should be filtered out
event_duration = df["EventDuration"]
event_duration.describe()

In [None]:
# Drop unusually long events with an upper cutoff of mean + 2 standard deviations.
# For a normal distribution about 95% of values lie within 2 standard deviations of the
# mean and about 99.7% within 3, so mean + 3 * std would be a looser alternative.
# This is an outlier cutoff, not a confidence interval.
# Boolean indexing keeps the rows below the cutoff; .copy() makes filter_df independent of df.
durations = df["EventDuration"]
upper_interval = durations.mean() + 2 * durations.std()
filter_df = df.loc[durations < upper_interval].copy()
filter_df.head()

In [None]:
# To check the distribution of the filtered data, draw a histogram of EventDuration.
# The timedelta is converted to hours with .dt.total_seconds() so that the axis unit
# matches the title.
ax = (filter_df["EventDuration"].dt.total_seconds() / 3600).plot.hist(
    bins=100, title="Event Duration Histogram (Hours)")
ax.set_xlabel("Event duration (hours)")
plt.show()

In [None]:
# Histogram of the time between filing the application (EnteredOn) and the event start.
# The timedelta is converted to hours with .dt.total_seconds() so that the axis unit
# matches the title.
ax = (filter_df["ApplicationTimeDiff"].dt.total_seconds() / 3600).plot.hist(
    bins=1000, title="Application Histogram (Hours)")
ax.set_xlabel("Time from application to event start (hours)")
plt.show()

In [None]:
# Scatter Plot using seaborn to check relation between ApplicationTimeDiff and EventDuration

In [None]:
# Scatter plot of event duration against application lead time, both in hours,
# coloured by Borough. .dt.total_seconds() / 3600 replaces the hard-coded
# nanoseconds-per-hour divisor, so the result no longer depends on the timedelta's
# internal unit.
duration_hours = filter_df["EventDuration"].dt.total_seconds() / 3600
lead_time_hours = filter_df["ApplicationTimeDiff"].dt.total_seconds() / 3600
ax = sns.scatterplot(x=duration_hours, y=lead_time_hours, hue=filter_df["Borough"])
ax.set(xlabel="Event duration (hours)",
       ylabel="Time from application to event start (hours)",
       title="Event Duration vs Application Lead Time")
plt.show()