In [31]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

In [3]:
#load dataset and parse dates
ks18 = pd.read_csv('/content/ks-projects-201801.csv', parse_dates=['deadline', 'launched'])

In [4]:
ks18

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.00
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.00
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.00
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
378656,999976400,ChknTruk Nationwide Charity Drive 2014 (Canceled),Documentary,Film & Video,USD,2014-10-17,50000.0,2014-09-17 02:35:30,25.0,canceled,1,US,25.0,25.0,50000.00
378657,999977640,The Tribe,Narrative Film,Film & Video,USD,2011-07-19,1500.0,2011-06-22 03:35:14,155.0,failed,5,US,155.0,155.0,1500.00
378658,999986353,Walls of Remedy- New lesbian Romantic Comedy f...,Narrative Film,Film & Video,USD,2010-08-16,15000.0,2010-07-01 19:40:30,20.0,failed,1,US,20.0,20.0,15000.00
378659,999987933,BioDefense Education Kit,Technology,Technology,USD,2016-02-13,15000.0,2016-01-13 18:13:53,200.0,failed,6,US,200.0,200.0,15000.00


In [5]:
#check column names
print("\ncolumn names:\n", ks18.columns)


column names:
 Index(['ID', 'name', 'category', 'main_category', 'currency', 'deadline',
       'goal', 'launched', 'pledged', 'state', 'backers', 'country',
       'usd pledged', 'usd_pledged_real', 'usd_goal_real'],
      dtype='object')


In [6]:
#create a binary column of outcome 
ks18 = ks18.assign(outcome=(ks18['state'] == 'successful').astype(int))

In [7]:
# Timestamp features
ks18 = ks18.assign(hour=ks18.launched.dt.hour,
               day=ks18.launched.dt.day,
               month=ks18.launched.dt.month,
               year=ks18.launched.dt.year)

In [8]:
#Percentage of USD Pledged / USD Goal
ks18['percentage'] = ks18['usd_pledged_real']/ks18['usd_goal_real'] *100

In [9]:
# Label encoding
cat_features = ['category', 'main_category', 'currency', 'country']
encoder = LabelEncoder()
encoded = ks18[cat_features].apply(encoder.fit_transform)

data_cols = ['goal', 'hour', 'day', 'month', 'year', 'outcome']
data = ks18[data_cols].join(encoded)

In [11]:
# get the number of missing data points per column
missing_values_count = ks18.isnull().sum()

# look at the # of missing points in the first 17 columns
missing_values_count[0:16]

ID                     0
name                   4
category               0
main_category          0
currency               0
deadline               0
goal                   0
launched               0
pledged                0
state                  0
backers                0
country                0
usd pledged         3797
usd_pledged_real       0
usd_goal_real          0
outcome                0
dtype: int64

In [12]:
#display rows with missing values
null_data = ks18[ks18.isnull().any(axis=1)]
null_data

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,...,country,usd pledged,usd_pledged_real,usd_goal_real,outcome,hour,day,month,year,percentage
169,1000694855,STREETFIGHTERZ WHEELIE MURICA,Film & Video,Film & Video,USD,2014-09-20,6500.0,2014-08-06 21:28:36,555.00,undefined,...,"N,0""",,555.00,6500.00,0,21,6,8,2014,8.538462
328,100149523,Duncan Woods - Chameleon EP,Music,Music,AUD,2015-08-25,4500.0,2015-08-04 12:05:17,4767.00,undefined,...,"N,0""",,3402.08,3211.53,0,12,4,8,2015,105.933309
632,1003023003,The Making of Ashley Kelley's Debut Album,Music,Music,USD,2015-04-09,3500.0,2015-03-10 20:06:13,3576.00,undefined,...,"N,0""",,3576.00,3500.00,0,20,10,3,2015,102.171429
647,1003130892,Butter Side Down Debut Album,Music,Music,USD,2015-11-26,6000.0,2015-11-02 22:09:19,7007.80,undefined,...,"N,0""",,7007.80,6000.00,0,22,2,11,2015,116.796667
749,1003629045,Chase Goehring debut EP,Music,Music,USD,2016-03-21,3000.0,2016-02-23 03:09:49,3660.38,undefined,...,"N,0""",,3660.38,3000.00,0,3,23,2,2016,122.012667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
378233,997971307,"EQUUS International Film Festival®, all-equine...",Film & Video,Film & Video,USD,2015-02-03,7500.0,2014-12-05 04:19:14,10.00,undefined,...,"N,0""",,10.00,7500.00,0,4,5,12,2014,0.133333
378303,998319149,Emily 2050 - Short Film,Film & Video,Film & Video,CAD,2014-05-23,3000.0,2014-04-08 00:30:09,3102.00,undefined,...,"N,0""",,2845.61,2752.04,0,0,8,4,2014,103.400023
378434,9988744,Matthew Stephens Music,Music,Music,USD,2016-02-05,5000.0,2016-01-06 21:59:23,235.00,undefined,...,"N,0""",,235.00,5000.00,0,21,6,1,2016,4.700000
378585,999610349,Lady Vendredi: Afrofuturist concept 12 inch EP,Music,Music,GBP,2015-10-19,2000.0,2015-09-21 22:33:18,2125.00,undefined,...,"N,0""",,3273.36,3080.81,0,22,21,9,2015,106.249980


In [13]:
#total number of observations
len(ks18.index)

378661

In [14]:
# total missing values
total_cells = len(ks18.index)
total_missing = missing_values_count.sum()

# percent of data that is missing
(total_missing/total_cells) * 100

1.003800232926021

In [15]:
#drop rows with missing values
ks18_drop = ks18.dropna()

In [16]:
# check number of missing data points per column after dropping missing values
missing_values_count = ks18_drop.isnull().sum()

# look at the # of missing points in the first 17 columns
missing_values_count[0:16]

ID                  0
name                0
category            0
main_category       0
currency            0
deadline            0
goal                0
launched            0
pledged             0
state               0
backers             0
country             0
usd pledged         0
usd_pledged_real    0
usd_goal_real       0
outcome             0
dtype: int64

In [17]:
# counting unique values
n = len(pd.unique(ks18_drop['ID']))
  
print("No.of.unique values :", 
      n)

No.of.unique values : 374860


In [18]:
#check duplicated items
#check if the total number of observations matches with the number of unique values
len(ks18_drop.index)

374860

In [19]:
#check dtypes
ks18_drop.dtypes

ID                           int64
name                        object
category                    object
main_category               object
currency                    object
deadline            datetime64[ns]
goal                       float64
launched            datetime64[ns]
pledged                    float64
state                       object
backers                      int64
country                     object
usd pledged                float64
usd_pledged_real           float64
usd_goal_real              float64
outcome                      int64
hour                         int64
day                          int64
month                        int64
year                         int64
percentage                 float64
dtype: object

In [20]:
#check if all values are numeric in these columns
# convert non-numeric values to missing values
ks18_drop['usd_goal_real'] = ks18_drop['usd_goal_real'].replace(regex='|[^\d+]', value=np.nan)
ks18_drop['usd_pledged_real'] = ks18_drop['usd_pledged_real'].replace(regex='|[^\d+]', value=np.nan)
ks18_drop['usd pledged'] = ks18_drop['usd pledged'].replace(regex='|[^\d+]', value=np.nan)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [21]:
# no missing values, i.e. no non-numeric values in above mentioned columns
missing_values_count = ks18_drop.isnull().sum()
missing_values_count[0:17]

ID                  0
name                0
category            0
main_category       0
currency            0
deadline            0
goal                0
launched            0
pledged             0
state               0
backers             0
country             0
usd pledged         0
usd_pledged_real    0
usd_goal_real       0
outcome             0
hour                0
dtype: int64

In [32]:
# mix-max scale the data between 0 and 1
min_max_scaler = MinMaxScaler()
ks18_drop[["usd_goal_real"]] = min_max_scaler.fit_transform(ks18_drop[["usd_goal_real"]])
ks18_drop[["usd_pledged_real"]] = min_max_scaler.fit_transform(ks18_drop[["usd_pledged_real"]])
ks18_drop

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)


Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,...,country,usd pledged,usd_pledged_real,usd_goal_real,outcome,hour,day,month,year,percentage
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,...,GB,0.0,0.000000e+00,0.000009,0,12,11,8,2015,0.000000
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,...,US,100.0,1.190325e-04,0.000180,0,4,2,9,2017,8.070000
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,...,US,220.0,1.081666e-05,0.000270,0,0,12,1,2013,0.488889
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,...,US,1.0,4.916666e-08,0.000030,0,3,17,3,2012,0.020000
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,...,US,1283.0,6.308082e-05,0.000117,0,8,4,7,2015,6.579487
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
378656,999976400,ChknTruk Nationwide Charity Drive 2014 (Canceled),Documentary,Film & Video,USD,2014-10-17,50000.0,2014-09-17 02:35:30,25.0,canceled,...,US,25.0,1.229166e-06,0.000301,0,2,17,9,2014,0.050000
378657,999977640,The Tribe,Narrative Film,Film & Video,USD,2011-07-19,1500.0,2011-06-22 03:35:14,155.0,failed,...,US,155.0,7.620832e-06,0.000009,0,3,22,6,2011,10.333333
378658,999986353,Walls of Remedy- New lesbian Romantic Comedy f...,Narrative Film,Film & Video,USD,2010-08-16,15000.0,2010-07-01 19:40:30,20.0,failed,...,US,20.0,9.833332e-07,0.000090,0,19,1,7,2010,0.133333
378659,999987933,BioDefense Education Kit,Technology,Technology,USD,2016-02-13,15000.0,2016-01-13 18:13:53,200.0,failed,...,US,200.0,9.833332e-06,0.000090,0,18,13,1,2016,1.333333


In [33]:
ks18_drop

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,...,country,usd pledged,usd_pledged_real,usd_goal_real,outcome,hour,day,month,year,percentage
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,...,GB,0.0,0.000000e+00,0.000009,0,12,11,8,2015,0.000000
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,...,US,100.0,1.190325e-04,0.000180,0,4,2,9,2017,8.070000
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,...,US,220.0,1.081666e-05,0.000270,0,0,12,1,2013,0.488889
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,...,US,1.0,4.916666e-08,0.000030,0,3,17,3,2012,0.020000
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,...,US,1283.0,6.308082e-05,0.000117,0,8,4,7,2015,6.579487
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
378656,999976400,ChknTruk Nationwide Charity Drive 2014 (Canceled),Documentary,Film & Video,USD,2014-10-17,50000.0,2014-09-17 02:35:30,25.0,canceled,...,US,25.0,1.229166e-06,0.000301,0,2,17,9,2014,0.050000
378657,999977640,The Tribe,Narrative Film,Film & Video,USD,2011-07-19,1500.0,2011-06-22 03:35:14,155.0,failed,...,US,155.0,7.620832e-06,0.000009,0,3,22,6,2011,10.333333
378658,999986353,Walls of Remedy- New lesbian Romantic Comedy f...,Narrative Film,Film & Video,USD,2010-08-16,15000.0,2010-07-01 19:40:30,20.0,failed,...,US,20.0,9.833332e-07,0.000090,0,19,1,7,2010,0.133333
378659,999987933,BioDefense Education Kit,Technology,Technology,USD,2016-02-13,15000.0,2016-01-13 18:13:53,200.0,failed,...,US,200.0,9.833332e-06,0.000090,0,18,13,1,2016,1.333333
