# EDA

In [None]:
# import the necessary libraries you need for your analysis
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from matplotlib.ticker import PercentFormatter
from datetime import datetime as dt

# General plotting and display configuration
# Figure size, white backgrounds and black axes borders, set in one pass
plt.rcParams.update(
    {
        "figure.figsize": (15, 5),
        "axes.facecolor": "white",
        "axes.edgecolor": "black",
        "figure.facecolor": "w",
    }
)
pd.plotting.register_matplotlib_converters()


def _one_decimal(value):
    """Format a float with exactly one decimal place for DataFrame display."""
    return '%.1f' % value


# Floats (decimal numbers) are displayed rounded to 1 decimal place
pd.set_option('display.float_format', _one_decimal)
# NOTE(review): the style sheet is applied after the rcParams above, so any
# keys it defines take precedence over them - confirm this order is intended.
plt.style.use('fivethirtyeight')

In [None]:
# Load the dataset from the local CSV file into the working DataFrame
df = pd.read_csv(filepath_or_buffer='data/2_data.csv')

In [None]:
# Print every available column name, one per line, in sorted order
sorted_columns = df.columns.sort_values()
for column_name in sorted_columns:
    print(column_name)

# Outcome variable = State is successful or not

In [None]:
# Number of projects per outcome state
state_counts = df['state'].value_counts()
state_counts

In [None]:
# Summary statistics for the outcome state and the pledged amount.
# describe() summarises only numeric columns by default, which drops the
# text-valued 'state' column; include='all' keeps both columns in the table.
df[['state','usd_pledged']].describe(include='all')

In [None]:
# Project counts per state, one colour per state
sns.histplot(x='state', hue='state', data=df)
plt.show()

# Category distribution by project outcome

In [None]:
# Successful projects only
bdata = df.query('state == "successful"')
# Mean number of successful projects per main category (dashed reference line)
successful_per_category = bdata.groupby('category_main')['category_main'].count()
axline_successful = successful_per_category.mean()

# Bars ordered from the most to the least frequent main category
category_order = bdata['category_main'].value_counts().index
b = sns.countplot(x='category_main', data=bdata, order=category_order, color='forestgreen')
b.set_xticklabels(b.get_xticklabels(), rotation=45, size=10)
b.axhline(y=axline_successful, color='black', linestyle="--", linewidth=1)
# Blank out both axis titles
b.set_xlabel(" ")
b.set_ylabel(" ")
plt.show()

In [None]:
# Failed projects only
cdata = df.query('state == "failed"')
# Mean number of failed projects per main category (dashed reference line)
axline_unsuccessful = cdata.groupby('category_main')['category_main'].count().mean()

# Count the failed projects (cdata); bars ordered from the most to the least
# frequent main category among failed projects
c = sns.countplot(data=cdata, x='category_main', order=cdata['category_main'].value_counts().index, color='indianred')
# Rotate this plot's own tick labels so they match the bars drawn above
c.set_xticklabels(c.get_xticklabels(), rotation=45, size=10)
c.axhline(y=axline_unsuccessful, color='black', linestyle="--", linewidth=1)
plt.xlabel(" ")
plt.ylabel(" ")
plt.show()

In [None]:
# Best subcategories
# Successful projects only
ddata = df.query('state == "successful"')
# Mean number of successful projects per subcategory (dashed reference line),
# computed from this cell's own data so the cell does not depend on others
axline_successful2 = ddata.groupby('category_sub')['category_sub'].count().mean()

# Bars ordered from the most to the least frequent subcategory
d = sns.countplot(data=ddata, x='category_sub', order=ddata['category_sub'].value_counts().index, color='forestgreen')
d.set_xticklabels(d.get_xticklabels(), rotation=45, size=10)
# Draw the reference line on this plot's axes
d.axhline(y=axline_successful2, color='black', linestyle="--", linewidth=1)
plt.xlabel(" ")
plt.ylabel(" ")
plt.show()

# Length of title and description

In [None]:
# Distribution of the name length, one panel and colour per project state
sns.displot(x='name_length', col='state', hue='state', data=df)

In [None]:
# Distribution of the description length, one panel and colour per project state
sns.displot(x='description_length', col='state', hue='state', data=df)

# Number of projects over time

In [None]:
# Parse the launch and end date columns into datetime values
for date_col in ('date_launch', 'date_end'):
    df[date_col] = pd.to_datetime(df[date_col])
# Quick look at the converted end dates
df['date_end'].head(2)

In [None]:
# Reduce the launch date to a monthly period
launch_dates = pd.to_datetime(df['date_launch'])
df['date_launch_month'] = launch_dates.dt.to_period('M')
df['date_launch_month'].head(2)

In [None]:
# Number of launched projects per month.
# Pass an explicit chronological order so the position of the bars does not
# depend on the row order of df (same approach as the quarterly plot).
month_order = sorted(df['date_launch_month'].dropna().unique())
t = sns.countplot(data=df, x='date_launch_month', color='steelblue', order=month_order)
t.set_xticklabels(t.get_xticklabels(), rotation=90, size=10)
plt.show()

In [None]:
# Reduce the launch date to a quarterly period
launch_dates = pd.to_datetime(df['date_launch'])
df['date_launch_quarter'] = launch_dates.dt.to_period('Q')
df['date_launch_quarter'].head(2)

In [None]:
# Project count per launch quarter, sorted chronologically
quarter_groups = df.groupby('date_launch_quarter')[['id']]
ma = quarter_groups.count().reset_index().sort_values('date_launch_quarter')
# Mean number of projects per quarter (used as reference line in the plot)
ma2 = ma['id'].mean()

In [None]:
# Projects launched per quarter, bars in the order given by ma
quarter_order = ma['date_launch_quarter']
t = sns.countplot(x='date_launch_quarter', data=df, order=quarter_order, color='steelblue')
t.set_xticklabels(t.get_xticklabels(), rotation=90, size=10)
# Dashed line at the mean quarterly project count
t.axhline(y=ma2, color='black', linestyle="--", linewidth=1)
# Blank out both axis titles
t.set_xlabel(" ")
t.set_ylabel(" ")
plt.show()

In [None]:
# Preview the first five rows, including the columns added above
df.head(n=5)

In [None]:
# Campaign duration: time between launch and end date
df['duration'] = df['date_end'] - df['date_launch']
# Round to the nearest whole day and keep only the day count as a number.
# 'D' is the documented offset alias for a calendar day.
df['duration'] = df['duration'].dt.round('D').dt.days
df['duration'].head()

In [None]:

# Calculate the average project duration in days
duration_days = df['duration']
avg_duration = duration_days.mean()
avg_duration

In [None]:
# Duration histogram split by project state (no average line is drawn)
t1 = sns.histplot(x='duration', hue='state', binwidth=3, data=df)
# Blank axis titles, descriptive plot title
t1.set_xlabel(" ")
t1.set_ylabel(" ")
t1.set_title("Distribution of the project duration by project state")
plt.show()


In [None]:
# Launch day of week
# Numeric weekday of the launch date (Monday=0 ... Sunday=6)
df['launch_day'] = df['date_launch'].dt.weekday
# Weekday name, the counterpart of month_name() used for the launch month
df['launch_day_name'] = df['date_launch'].dt.day_name()
df[['launch_day','launch_day_name']].head()

In [None]:
# Projects launched per weekday number, split by project state
t2 = sns.countplot(x='launch_day', hue='state', data=df)
t2.set_xlabel("Day of week Monday - Sunday")
t2.set_ylabel(" ")
t2.set_title("Distribution of the project launch by day of week")
plt.show()

In [None]:
# Launch month as a number and as a name
launch_dt = df['date_launch'].dt
df['launch_month'] = launch_dt.month
df['launch_month_name'] = launch_dt.month_name()
# Show both derived columns side by side
df[['launch_month','launch_month_name']].head()

In [None]:
# Projects launched per calendar month number, split by project state
t3 = sns.countplot(x='launch_month', hue='state', data=df)
# Blank axis titles, descriptive plot title
t3.set_xlabel(" ")
t3.set_ylabel(" ")
t3.set_title("Distribution of the project launch by month")
plt.show()