In [1]:
# Import relevant libraries and packages.
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
import statsmodels.api as sm 
from statsmodels.graphics.api import abline_plot 
from sklearn.metrics import mean_squared_error, r2_score 
from sklearn.model_selection import train_test_split 
from sklearn import linear_model, preprocessing 
import warnings 
warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.filterwarnings(action="ignore", module="scipy", message="^internal gelsd")

import os 

In [2]:
# Show the kernel's current working directory.
os.getcwd()

'/Users/yusufsmacbookpro/Desktop/Springboard/Capstone1'

In [3]:
# Load the cleaned start-up dataset into `df`.
# The CSV sits in the notebook's working directory (see the os.getcwd() output
# above), so a relative path is used instead of a machine-specific absolute
# path that only resolves on one user's computer.
df = pd.read_csv('start_up_cleaned.csv')

In [4]:
# Preview the first five rows, transposed so each column is shown as a row.
df.head(5).transpose()

Unnamed: 0,0,1,2,3,4
state_code,CA,CA,CA,CA,CA
latitude,42.3589,37.2389,32.901,37.3203,37.7793
longitude,-71.0568,-121.974,-117.193,-122.05,-122.419
zip_code,92101,95032,92121,95014,94105
city,San Diego,Los Gatos,San Diego,Cupertino,San Francisco
name,Bandsintown,TriCipher,Plixi,Solidcore Systems,Inhale Digital
labels,1,1,1,1,0
founded_at,2007-01-01,2000-01-01,2009-03-18,2002-01-01,2010-08-01
first_funding_at,2009-04-01,2005-02-14,2010-03-30,2005-02-17,2010-08-01
last_funding_at,2010-01-01,2009-12-28,2010-03-30,2007-04-25,2012-04-01


In [5]:
# List the dtype pandas assigned to each column when reading the CSV.
df.dtypes

state_code                   object
latitude                    float64
longitude                   float64
zip_code                     object
city                         object
name                         object
labels                        int64
founded_at                   object
first_funding_at             object
last_funding_at              object
age_first_funding_year      float64
age_last_funding_year       float64
age_first_milestone_year    float64
age_last_milestone_year     float64
relationships                 int64
funding_rounds                int64
funding_total_usd             int64
milestones                    int64
is_CA                         int64
is_NY                         int64
is_MA                         int64
is_TX                         int64
is_otherstate                 int64
category_code                object
is_software                   int64
is_web                        int64
is_mobile                     int64
is_enterprise               

In [6]:
# The three date columns are loaded as object dtype (see df.dtypes above);
# parse each of them into datetime values.
for date_col in ("founded_at", "first_funding_at", "last_funding_at"):
    df[date_col] = pd.to_datetime(df[date_col])

# Quick look at the first three rows after the conversion.
df.head(3)

Unnamed: 0,state_code,latitude,longitude,zip_code,city,name,labels,founded_at,first_funding_at,last_funding_at,...,object_id,has_VC,has_angel,has_roundA,has_roundB,has_roundC,has_roundD,avg_participants,is_top500,status
0,CA,42.35888,-71.05682,92101,San Diego,Bandsintown,1,2007-01-01,2009-04-01,2010-01-01,...,c:6669,0,1,0,0,0,0,1.0,0,acquired
1,CA,37.238916,-121.973718,95032,Los Gatos,TriCipher,1,2000-01-01,2005-02-14,2009-12-28,...,c:16283,1,0,0,1,1,1,4.75,1,acquired
2,CA,32.901049,-117.192656,92121,San Diego,Plixi,1,2009-03-18,2010-03-30,2010-03-30,...,c:65620,0,0,1,0,0,0,4.0,1,acquired


In [7]:
# Re-list the dtypes to confirm the three date columns are now datetime64.
df.dtypes

state_code                          object
latitude                           float64
longitude                          float64
zip_code                            object
city                                object
name                                object
labels                               int64
founded_at                  datetime64[ns]
first_funding_at            datetime64[ns]
last_funding_at             datetime64[ns]
age_first_funding_year             float64
age_last_funding_year              float64
age_first_milestone_year           float64
age_last_milestone_year            float64
relationships                        int64
funding_rounds                       int64
funding_total_usd                    int64
milestones                           int64
is_CA                                int64
is_NY                                int64
is_MA                                int64
is_TX                                int64
is_otherstate                        int64
category_co

In [8]:
# Print the index range, column count, and per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 922 entries, 0 to 921
Data columns (total 44 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   state_code                922 non-null    object        
 1   latitude                  922 non-null    float64       
 2   longitude                 922 non-null    float64       
 3   zip_code                  922 non-null    object        
 4   city                      922 non-null    object        
 5   name                      922 non-null    object        
 6   labels                    922 non-null    int64         
 7   founded_at                922 non-null    datetime64[ns]
 8   first_funding_at          922 non-null    datetime64[ns]
 9   last_funding_at           922 non-null    datetime64[ns]
 10  age_first_funding_year    922 non-null    float64       
 11  age_last_funding_year     922 non-null    float64       
 12  age_first_milestone_ye

In [9]:
# (number of rows, number of columns)
df.shape

(922, 44)

In [10]:
# Missing values were seen in age_first_milestone_year and
# age_last_milestone_year, so fill each column with its own mean.
#
# The filled Series is assigned back to the frame rather than calling
# `df[col].fillna(..., inplace=True)`. That form works on an intermediate
# Series: pandas 2.x deprecates it as chained assignment (the FutureWarning is
# hidden by the warnings filter in the import cell), and with Copy-on-Write
# enabled it leaves `df` unchanged.
for milestone_col in ("age_first_milestone_year", "age_last_milestone_year"):
    df[milestone_col] = df[milestone_col].fillna(df[milestone_col].mean())

In [11]:
# Re-print the per-column non-null counts after the mean imputation.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 922 entries, 0 to 921
Data columns (total 44 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   state_code                922 non-null    object        
 1   latitude                  922 non-null    float64       
 2   longitude                 922 non-null    float64       
 3   zip_code                  922 non-null    object        
 4   city                      922 non-null    object        
 5   name                      922 non-null    object        
 6   labels                    922 non-null    int64         
 7   founded_at                922 non-null    datetime64[ns]
 8   first_funding_at          922 non-null    datetime64[ns]
 9   last_funding_at           922 non-null    datetime64[ns]
 10  age_first_funding_year    922 non-null    float64       
 11  age_last_funding_year     922 non-null    float64       
 12  age_first_milestone_ye

In [12]:
# `status` holds the outcome we are trying to predict. Replace the text column
# with dummy (indicator) columns; drop_first=True omits the indicator for the
# first category.
df = pd.get_dummies(data=df, columns=["status"], drop_first=True)

# Preview the first ten rows, transposed so each column is shown as a row.
df.head(10).transpose()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
state_code,CA,CA,CA,CA,CA,CA,CA,CA,MA,CA
latitude,42.3589,37.2389,32.901,37.3203,37.7793,37.4069,37.3916,38.0571,42.7122,37.4272
longitude,-71.0568,-121.974,-117.193,-122.05,-122.419,-122.09,-122.07,-122.514,-73.2036,-122.146
zip_code,92101,95032,92121,95014,94105,94043,94041,94901,1267,94306
city,San Diego,Los Gatos,San Diego,Cupertino,San Francisco,Mountain View,Mountain View,San Rafael,Williamstown,Palo Alto
name,Bandsintown,TriCipher,Plixi,Solidcore Systems,Inhale Digital,Matisse Networks,RingCube Technologies,ClairMail,VoodooVox,Doostang
labels,1,1,1,1,0,0,1,1,1,1
founded_at,2007-01-01 00:00:00,2000-01-01 00:00:00,2009-03-18 00:00:00,2002-01-01 00:00:00,2010-08-01 00:00:00,2002-01-01 00:00:00,2005-01-01 00:00:00,2004-01-01 00:00:00,2002-01-01 00:00:00,2005-06-01 00:00:00
first_funding_at,2009-04-01 00:00:00,2005-02-14 00:00:00,2010-03-30 00:00:00,2005-02-17 00:00:00,2010-08-01 00:00:00,2006-07-18 00:00:00,2006-09-21 00:00:00,2005-08-24 00:00:00,2005-08-02 00:00:00,2007-02-01 00:00:00
last_funding_at,2010-01-01 00:00:00,2009-12-28 00:00:00,2010-03-30 00:00:00,2007-04-25 00:00:00,2012-04-01 00:00:00,2006-07-18 00:00:00,2010-03-18 00:00:00,2010-10-04 00:00:00,2013-02-08 00:00:00,2010-02-05 00:00:00


In [13]:
# Columns removed from the frame before it is saved.
unused_columns = [
    'latitude',
    'longitude',
    'zip_code',
    'city',
    'labels',
    'object_id',
    'is_top500',
]
df.drop(columns=unused_columns, inplace=True)

In [15]:
# Preview the first five rows of the reduced frame, one column per displayed row.
df.head(5).transpose()

Unnamed: 0,0,1,2,3,4
state_code,CA,CA,CA,CA,CA
name,Bandsintown,TriCipher,Plixi,Solidcore Systems,Inhale Digital
founded_at,2007-01-01 00:00:00,2000-01-01 00:00:00,2009-03-18 00:00:00,2002-01-01 00:00:00,2010-08-01 00:00:00
first_funding_at,2009-04-01 00:00:00,2005-02-14 00:00:00,2010-03-30 00:00:00,2005-02-17 00:00:00,2010-08-01 00:00:00
last_funding_at,2010-01-01 00:00:00,2009-12-28 00:00:00,2010-03-30 00:00:00,2007-04-25 00:00:00,2012-04-01 00:00:00
age_first_funding_year,2.2493,5.126,1.0329,3.1315,0
age_last_funding_year,3.0027,9.9973,1.0329,5.3151,1.6685
age_first_milestone_year,4.6685,7.0055,1.4575,6.0027,0.0384
age_last_milestone_year,6.7041,7.0055,2.2055,6.0027,0.0384
relationships,3,9,5,5,2


In [16]:
# Round the frame's values to two decimal places.
df = df.round(2)

In [17]:
# Preview the first five rows after rounding, one column per displayed row.
df.head(5).transpose()

Unnamed: 0,0,1,2,3,4
state_code,CA,CA,CA,CA,CA
name,Bandsintown,TriCipher,Plixi,Solidcore Systems,Inhale Digital
founded_at,2007-01-01 00:00:00,2000-01-01 00:00:00,2009-03-18 00:00:00,2002-01-01 00:00:00,2010-08-01 00:00:00
first_funding_at,2009-04-01 00:00:00,2005-02-14 00:00:00,2010-03-30 00:00:00,2005-02-17 00:00:00,2010-08-01 00:00:00
last_funding_at,2010-01-01 00:00:00,2009-12-28 00:00:00,2010-03-30 00:00:00,2007-04-25 00:00:00,2012-04-01 00:00:00
age_first_funding_year,2.25,5.13,1.03,3.13,0
age_last_funding_year,3,10,1.03,5.32,1.67
age_first_milestone_year,4.67,7.01,1.46,6,0.04
age_last_milestone_year,6.7,7.01,2.21,6,0.04
relationships,3,9,5,5,2


In [18]:
# Write the processed frame to the current user's Desktop, without the index
# column. The home directory is resolved at run time instead of hard-coding one
# user's absolute path, so the cell also works under other accounts.
output_path = os.path.join(os.path.expanduser('~'), 'Desktop', 'start_up_updated_clean.csv')
df.to_csv(output_path, index=False)