In [23]:
# Importamos nuestras librerías a usar

import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
from scipy.stats import linregress
from scipy.stats import ttest_1samp
from scipy.stats import ttest_rel
from scipy.stats import ttest_ind
import statsmodels.api as sm

In [2]:
# Leémos el dataframe
data = pd.read_csv('ks-projects.csv')

In [3]:
# Análizamos que columnas y datos trae
data.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0


In [5]:
# Haremos un subset de los proyectos que realmente fueron fondeados
# Con el fin de solo trabajar sobre los que fueron exitosos
# Revisamos que tipo de estatus tienen los proyectos
data.state.unique()

array(['failed', 'canceled', 'successful', 'live', 'undefined',
       'suspended'], dtype=object)

In [12]:
# Filtramos por los proyectos que realmente fueron exitosos
successful_projects = data[data['state'] == 'successful']
successful_projects['state'].unique() # Rectificamos con el método unique()

array(['successful'], dtype=object)

In [11]:
# Revisamos nuestra información
successful_projects

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
5,1000014025,Monarch Espresso Bar,Restaurants,Food,USD,2016-04-01,50000.0,2016-02-26 13:38:27,52375.00,successful,224,US,52375.00,52375.00,50000.00
6,1000023410,Support Solar Roasted Coffee & Green Energy! ...,Food,Food,USD,2014-12-21,1000.0,2014-12-01 18:30:44,1205.00,successful,16,US,1205.00,1205.00,1000.00
11,100005484,Lisa Lim New CD!,Indie Rock,Music,USD,2013-04-08,12500.0,2013-03-09 06:42:58,12700.00,successful,100,US,12700.00,12700.00,12500.00
14,1000057089,Tombstone: Old West tabletop game and miniatur...,Tabletop Games,Games,GBP,2017-05-03,5000.0,2017-04-05 19:44:18,94175.00,successful,761,GB,57763.78,121857.33,6469.73
18,1000070642,Mike Corey's Darkness & Light Album,Music,Music,USD,2012-08-17,250.0,2012-08-02 14:11:32,250.00,successful,7,US,250.00,250.00,250.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
378642,999929142,ÉPOUVANTAILS : 28mm Figurines de jeux pour KIN...,Tabletop Games,Games,EUR,2017-10-31,1000.0,2017-10-04 11:26:44,1246.00,successful,35,FR,66.72,1452.47,1165.70
378644,999934908,The Manual Bar Blade,Product Design,Design,USD,2015-12-15,3500.0,2015-11-23 07:33:14,6169.00,successful,120,US,6169.00,6169.00,3500.00
378646,999943841,The Dog Coffee Book,Children's Books,Publishing,USD,2013-11-30,950.0,2013-10-18 21:35:04,1732.02,successful,31,US,1732.02,1732.02,950.00
378651,999969812,AT THE BEACH,Classical Music,Music,CAD,2014-03-22,5000.0,2014-02-20 01:00:16,5501.00,successful,78,CA,5019.92,4983.69,4529.81


### Hypothesis Test

Realizaremos un hypothesis test, usando como hipótesis alternativa: "Que los proyectos exitósos fueron respaldados por una media de personas mayor a 500". Ya que, creemos que en Kickstarter se levanta capital en volumen más que en solo algunos individuos.

Y como hipótesis nula, tenemos que: "La media de backers es menor a 500". 

In [13]:
# Obtenemos algúnas métricas estadísticas que nos servirán para realizar el análisis a través de un describe()
successful_projects.describe()

Unnamed: 0,ID,goal,pledged,backers,usd pledged,usd_pledged_real,usd_goal_real
count,133956.0,133956.0,133956.0,133956.0,133851.0,133956.0,133956.0
mean,1073727000.0,10162.96,24099.78,263.921362,17356.03,22670.8,9532.853
std,619151400.0,31248.29,158471.3,1504.580142,130122.8,151090.3,27961.44
min,21109.0,0.01,1.0,0.0,0.0,0.79,0.01
25%,535444300.0,1250.0,1977.75,33.0,1162.0,2000.0,1301.915
50%,1076595000.0,3923.0,5117.0,71.0,3655.66,5107.25,3837.74
75%,1608161000.0,10000.0,13440.25,167.0,10426.0,13232.08,10000.0
max,2147476000.0,2000000.0,20338990.0,219382.0,20338990.0,20338990.0,2015609.0


In [19]:
# Corremos el Hypothesis Test

backers = np.random.normal(263.92, 1504.58, 133956)
ttest_1samp(backers, 500) 

Ttest_1sampResult(statistic=-57.798030519491455, pvalue=0.0)

Con un 95% de Nivel de Confianza, aceptámos la hipótesis nula, la cual indica que la media de backers es menor a 500. 

### Two Sample Hypothesis Tests with Scipy


In [21]:
# Aplicaremos el ttest_rel para evaluar antes del fondeo (la meta)
# Vs el después que fue la meta alcanzada (la meta real)

ttest_rel(successful_projects.goal, successful_projects.usd_goal_real)

Ttest_relResult(statistic=15.824359743308838, pvalue=2.3776238714100174e-56)

### Independent Samples

Para esta análisis, evaluaremos la diferencia existente entre el USD y el Euro.

In [22]:
usd_projects = successful_projects[successful_projects['currency'] == 'USD']
eur_projects = successful_projects[successful_projects['currency'] == 'EUR']

In [25]:
ttest_ind(usd_projects.usd_goal_real, eur_projects.usd_goal_real, equal_var=True)

Ttest_indResult(statistic=-6.039220811047762, pvalue=1.5534020762136159e-09)

### Linear Regression

In [26]:
X = sm.add_constant(successful_projects.goal)
Y = successful_projects.usd_goal_real

model = sm.OLS(Y,X).fit()
predictions = model.predict(X)

print_model = model.summary()
print(print_model)

                            OLS Regression Results                            
Dep. Variable:          usd_goal_real   R-squared:                       0.783
Model:                            OLS   Adj. R-squared:                  0.783
Method:                 Least Squares   F-statistic:                 4.822e+05
Date:                Tue, 13 Jul 2021   Prob (F-statistic):               0.00
Time:                        23:14:05   Log-Likelihood:            -1.4594e+06
No. Observations:              133956   AIC:                         2.919e+06
Df Residuals:                  133954   BIC:                         2.919e+06
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       1487.9524     37.459     39.722      0.0