# Data cleaning
This script cleans the Kickstarter dataset. Please run it only after the setup is complete.
It changes column names, drops unnecessary features and checks the currency columns for validity.
Collaborators: Philippa Schindler

In [None]:
import pandas as pd
import re

In [None]:
# Load the working dataset (expected to exist once the setup has been run).
df = pd.read_csv(filepath_or_buffer='data/1_working_data.csv')

## Renaming

In [None]:
# Rename blurb -> description and tag pledged / goal with an _orig suffix.
text_and_amount_names = {
    'blurb': 'description',
    'pledged': 'pledged_orig',
    'goal': 'goal_orig',
}
df = df.rename(columns=text_and_amount_names)

In [None]:
# Give the four timestamp columns a common date_* prefix.
date_names = {
    'launched_at': 'date_launch',
    'created_at': 'date_creation',
    'deadline': 'date_end',
    'state_changed_at': 'date_status_change',
}
df = df.rename(columns=date_names)


## Drop

In [None]:
# Drop columns that are not needed for the further analysis.
# The result is assigned back to df: DataFrame.drop returns a new frame, so
# storing it in a separate, never-used variable would leave all of these
# columns in df and therefore in the exported CSV.
unnecessary_columns = [
    'Unnamed: 0',
    'currency_symbol',
    'converted_pledged_amount',
    'currency_trailing_code',
    'friends',
    'fx_rate',
    'is_backing',
    'is_starrable',
    'is_starred',
    'permissions',
    'slug',
    'source_url',
    'usd_type',
]
df = df.drop(columns=unnecessary_columns)

In [None]:
# Keep only rows whose state is "failed" or "successful", then remove
# rows with a duplicated id (the first occurrence of each id is kept).
failed_or_successful = df.query('(state == "failed") or (state == "successful")')
df = failed_or_successful.drop_duplicates(subset='id')

In [None]:
# Quality check: preview the first five rows.
df.head(n=5)

## Split category

In [None]:
# Split category_slug ("main/sub") into a main and a sub category.
# n=1 splits on the first '/' only, so a slug with extra slashes cannot
# produce a third column; reindex guarantees exactly two columns even when
# no slug contains a '/' (the sub category is then missing for every row).
category_parts = (
    df['category_slug']
    .str.split('/', n=1, expand=True)
    .reindex(columns=[0, 1])
)
df['category_main'] = category_parts[0]
df['category_sub'] = category_parts[1]
df.head()

In [None]:
# Quality check: count rows with and without a missing main category.
df['category_main'].isnull().value_counts()

In [None]:
# Quality check: count rows with and without a missing sub category.
df['category_sub'].isnull().value_counts()

In [None]:
# Fill missing sub categories with the placeholder 'Misc' (miscellaneous).
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place call emits a FutureWarning on recent
# pandas versions and no longer updates df once Copy-on-Write is enabled.
df['category_sub'] = df['category_sub'].fillna('Misc')

In [None]:
# Quality check: confirm that no missing sub categories remain.
df['category_sub'].isnull().value_counts()

## Convert date columns to datetime format

In [None]:
# Parse the four date_* columns into pandas datetimes.
# NOTE(review): if these columns hold Unix epoch seconds, pd.to_datetime
# would need unit='s' - confirm against the input data.
for date_column in ('date_end', 'date_launch', 'date_status_change', 'date_creation'):
    df[date_column] = pd.to_datetime(df[date_column])

## Add new columns

In [None]:
# Campaign length: difference between the end date and the launch date.
campaign_length = df['date_end'] - df['date_launch']
df['duration'] = campaign_length
# Quality check: summary statistics of the new column.
df['duration'].describe()

In [None]:
# Cast the description column to the pandas 'string' dtype and show the
# resulting dtype as a check.
df = df.astype({'description': 'string'})
df.dtypes['description']

In [None]:
# Length of the project description in characters.
# .str.len() is used instead of .apply(len): len() raises a TypeError on a
# missing description, whereas .str.len() keeps the value missing.
df['description_length'] = df['description'].str.len()
# Quality check
df['description_length'].describe()

In [None]:
# Length of the project title in characters.
# .str.len() is used instead of .apply(len): len() raises a TypeError on a
# missing name (NaN), whereas .str.len() keeps the value missing.
df['name_length'] = df['name'].str.len()
# Quality check
df['name_length'].describe()

## Create final CSV to use for all further models

In [None]:
# Save the cleaned dataframe as the CSV used by all further models.
# NOTE(review): the row index is written as an extra unnamed column -
# confirm that downstream readers expect it.
df.to_csv(path_or_buf='data/2_data.csv')