# Data Cleaning

In [1]:
# Libraries
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load the projects CSV into a DataFrame
startup_data = pd.read_csv('ks-projects-201801.csv')

# Quick sanity check on what was loaded
print("Data type : ", type(startup_data))
print("Data dims : ", startup_data.shape)

# Keep the preview as the final expression: a notebook renders only the
# value of a cell's last expression, so a mid-cell .head() shows nothing.
startup_data.head()

In [None]:
# Print the per-column dtype and non-null count summary (used to spot missing values)
startup_data.info()

In [None]:
# Discard the columns that will not be used to predict the outcome.
# `columns=` already targets the column axis, so no `axis` argument is needed.
startup_data = startup_data.drop(
    columns=['name', 'ID', 'category', 'usd pledged', 'currency', 'goal', 'pledged']
)

In [None]:
# Keep only the rows whose outcome is 'failed' or 'successful';
# every other state is excluded from the prediction problem.
startup_data = startup_data[startup_data['state'].isin(['failed', 'successful'])]

# Seed the preview so the same 10 rows are displayed on every run
startup_data.sample(10, random_state=42)

1. Here, we can see the various statuses a startup can have, but we are concerned only with successful and failed startups, as this is the category we are predicting.
2. Thus we remove all other categories, such as 'canceled', 'live', 'undefined' and 'suspended', and keep only the remaining two.
3. We examine the null values and see that the columns 'name' and 'usd pledged' contain nulls. We do not need to correct this, because these columns are unnecessary for our problem and are dropped anyway.
4. We remove all unnecessary columns that are not going to help predict the success of the startup, such as 'ID', 'name', etc.


In [None]:
# Binary target: 1 where the project state is 'successful', 0 otherwise.
# The text label is dropped once its numeric counterpart exists.
startup_data = (
    startup_data
    .assign(state_num=startup_data['state'].eq('successful').astype(int))
    .drop(columns=['state'])
)
startup_data

In [None]:

# One encoder per categorical column, kept under its own name so the
# integer codes can be mapped back to the original labels later.
country_encoder = LabelEncoder()
main_category_encoder = LabelEncoder()

# Add the integer-coded columns next to the originals, which stay untouched
startup_data = startup_data.assign(
    encoded_country=country_encoder.fit_transform(startup_data['country']),
    encoded_main_category=main_category_encoder.fit_transform(startup_data['main_category']),
)

startup_data

In [None]:
# Convert both date columns to pandas datetimes so the `.dt` accessor
# and date arithmetic are available afterwards.
startup_data['deadline'] = pd.to_datetime(startup_data['deadline'])
startup_data['launched'] = pd.to_datetime(startup_data['launched'])
startup_data

In [None]:
# Derive the date-based features in one step, then drop the raw datetimes.
# `deadline` and `launched` must already be datetime columns here (the
# subtraction's `.dt.days` requires it), so no re-parsing is done.
startup_data = startup_data.assign(**{
    # Whole days between launch and deadline
    'Duration_of_campaign(days)': (startup_data['deadline'] - startup_data['launched']).dt.days,
    # Calendar year of each date
    'Deadline(y)': startup_data['deadline'].dt.year,
    'Launched(y)': startup_data['launched'].dt.year,
    # Calendar month (1-12) of each date
    'Launched(m)': startup_data['launched'].dt.month,
    'Deadline(m)': startup_data['deadline'].dt.month,
}).drop(columns=['deadline', 'launched'])
startup_data

1. As 'state' is a categorical variable, we need to convert this into numerical by assigning 'successful' as 1 and 'failed' as 0 in a new column named 'state_num'. We then drop the original state column.
2. We then use `LabelEncoder` from scikit-learn to encode the categorical columns of the DataFrame `startup_data`. We initialize one `LabelEncoder` object per categorical column and fit it to that column; after fitting, it transforms the column into encoded numerical values. Finally, we add new columns to the DataFrame (`encoded_country` and `encoded_main_category`) to store the encoded data, leaving the original columns (`country` and `main_category`) intact.
3. Then we convert the `deadline` and `launched` columns of the `startup_data` DataFrame to datetime objects with `pd.to_datetime`, enabling easier manipulation and analysis of date and time data.
4. We then calculate the duration of the campaign in days by subtracting the launched date from the deadline, for easier manipulation and analysis.
5. We also split the datetime variables into the separate variables Deadline(y), Launched(y), Launched(m) and Deadline(m), which hold the respective years and months of the data, and then we drop the original deadline and launched columns.






# Data analysis (Numerical)

# Data analysis (Categorical)