# Setup

In [None]:
# Install the datacleaner package (imported below for `autoclean`) into the notebook runtime.
!pip install datacleaner



In [None]:
from IPython.display import display
import pandas as pd
import numpy as np
from datacleaner import autoclean

# NLP
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK resources used for text processing (stop-word list, punkt
# tokenizer data, WordNet). Packages already present are reported as up-to-date.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Settings
# Display every column of a DataFrame instead of truncating wide frames.
pd.set_option('display.max_columns', None)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Mount Google Drive at /gdrive so files stored there can be read and written (Colab only).
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [None]:
# Read the source CSV using the Windows-1251 (Cyrillic Windows) code page.
gDrivePath = '/gdrive/MyDrive/Colab Notebooks/data/'
original_data = pd.read_csv(f'{gDrivePath}SS_data.csv', encoding='windows-1251')

In [None]:
def col_to_numeric(col):
    """Convert a column to a numeric dtype when every value can be parsed.

    Args:
        col: A pandas Series (or anything ``pd.to_numeric`` accepts).

    Returns:
        The numeric version of ``col``, or ``col`` itself, unchanged, when
        the conversion fails.
    """
    try:
        return pd.to_numeric(col)
    # Only conversion failures mean "leave the column as it is"; a bare
    # ``except`` would also swallow KeyboardInterrupt and real bugs.
    except (ValueError, TypeError):
        return col

def binaryCol(col):
    """Dummy-encode a column that holds exactly two distinct values.

    A two-valued column is replaced by the result of ``pd.get_dummies`` with
    the first level dropped; any other column is returned untouched. A
    ``TypeError`` raised while inspecting or encoding the column also yields
    the original column.
    """
    try:
        if col.nunique() != 2:
            return col
        return pd.get_dummies(col, drop_first=True)
    except TypeError:
        return col
    
def extractYearFromDate(row):
    """Return the year from a '/'-separated date string.

    The year is taken to be the last '/'-separated field (e.g. '5/26/2013'
    gives 2013). Non-string values, such as NaN or an already numeric year,
    are returned unchanged.

    Raises:
        ValueError: If the last field of a string value is not an integer.
    """
    # isinstance also accepts str subclasses (e.g. numpy.str_), which the
    # exact-type comparison rejected.
    if isinstance(row, str):
        return int(row.split('/')[-1])
    return row

# Outputs the columns that need to be dropped
def to_drop(dataset, corr_threshold, useless_columns):
    """List the columns to remove: highly correlated ones plus the given extras.

    Args:
        dataset: DataFrame whose pairwise column correlations are computed
            with ``DataFrame.corr()``.
        corr_threshold: A column is flagged when its absolute correlation
            with any column that precedes it exceeds this value.
        useless_columns: Extra column names appended to the result.

    Returns:
        A new list: the flagged columns in column order, followed by
        ``useless_columns``.
    """
    # Absolute correlation matrix
    corr_matrix = dataset.corr().abs()
    # Keep only the strict upper triangle so each pair is looked at once.
    # The builtin ``bool`` replaces ``np.bool``, which NumPy 1.24 removed.
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper = corr_matrix.where(upper_mask)
    # Masked cells are NaN and never compare greater than the threshold
    correlated = [
        column for column in upper.columns
        if (upper[column] > corr_threshold).any()
    ]
    return correlated + list(useless_columns)

def conditionalSplit(sent):
    """Split a value into trimmed parts on ',', '&' and ' and '.

    The value is converted with ``str`` first. A value whose string form is
    'nan' is returned as-is instead of being split.
    """
    text = str(sent)
    if text == 'nan':
        return sent
    # Normalise every separator to '@', then split on it
    for separator in (',', '&', ' and '):
        text = text.replace(separator, '@')
    return [piece.strip() for piece in text.split('@')]

In [None]:
def guess_data_type(column):
    """Classify a column as 'Binary', 'Categorical' or 'Text'.

    The label is based on the number of distinct non-null values: exactly
    two gives 'Binary'; fewer than 5% of the row count gives 'Categorical';
    anything else gives 'Text'.
    """
    distinct = len(column.value_counts())
    if distinct == 2:
        return 'Binary'
    if distinct / column.shape[0] < 0.05:
        return 'Categorical'
    return 'Text'

def low_freq_combine(data, threshold, value='other'):
    """Replace infrequent values of a Series with a single placeholder.

    A value is kept when its share of the non-null entries is at least
    ``threshold``; every other non-null value becomes ``value``. Null
    entries stay null.
    """
    counts = data.value_counts()
    shares = counts / data.notnull().sum()
    frequent = counts[shares >= threshold].index
    keep = data.isin(frequent) | data.isnull()
    return data.where(keep, other=value)

# Dataset overview

In [None]:
# Dataset preview: column/dtype/memory summary, then the first two rows
original_data.info()
original_data.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 472 entries, 0 to 471
Columns: 116 entries, Company_Name to Renown score
dtypes: float64(5), int64(3), object(108)
memory usage: 427.9+ KB


Unnamed: 0,Company_Name,Dependent-Company Status,year of founding,Age of company in years,Internet Activity Score,Short Description of company profile,Industry of company,Focus functions of company,Investors,Employee Count,Employees count MoM change,Has the team size grown,Est. Founding Date,Last Funding Date,Last Funding Amount,Country of company,Continent of company,Number of Investors in Seed,Number of Investors in Angel and or VC,Number of Co-founders,Number of of advisors,Team size Senior leadership,Team size all employees,Presence of a top angel or venture fund in previous round of investment,Number of of repeat investors,Number of Sales Support material,Worked in top companies,Average size of companies worked for in the past,Have been part of startups in the past?,Have been part of successful startups in the past?,Was he or she partner in Big 5 consulting?,Consulting experience?,Product or service company?,Catering to product/service across verticals,Focus on private or public data?,Focus on consumer data?,Focus on structured or unstructured data,Subscription based business,Cloud or platform based serive/product?,Local or global player,Linear or Non-linear business model,"Capital intensive business e.g. e-commerce, Engineering products and operations can also cause a business to be capital intensive",Number of of Partners of company,Crowdsourcing based business,Crowdfunding based business,Machine Learning based business,Predictive Analytics business,Speech analytics business,Prescriptive analytics business,Big Data Business,Cross-Channel Analytics/ marketing channels,Owns data or not? (monetization of data) e.g. Factual,Is the company an aggregator/market place? e.g. 
Bluekai,Online or offline venture - physical location based business or online venture?,B2C or B2B venture?,Top forums like 'Tech crunch' or 'Venture beat' talking about the company/model - How much is it being talked about?,Average Years of experience for founder and co founder,Exposure across the globe,Breadth of experience across verticals,Highest education,Years of education,Specialization of highest education,Relevance of education to venture,Relevance of experience to venture,Degree from a Tier 1 or Tier 2 university?,Renowned in professional circle,Experience in selling and building products,Experience in Fortune 100 organizations,Experience in Fortune 500 organizations,Experience in Fortune 1000 organizations,Top management similarity,Number of Recognitions for Founders and Co-founders,Number of of Research publications,Skills score,Team Composition score,Dificulty of Obtaining Work force,Pricing Strategy,Hyper localisation,Time to market service or product,Employee benefits and salary structures,Long term relationship with other founders,Proprietary or patent position (competitive position),Barriers of entry for the competitors,Company awards,Controversial history of founder or co founder,Legal risk and intellectual property,Client Reputation,google page rank of company website,Technical proficiencies to analyse and interpret unstructured data,Solutions offered,Invested through global incubation competitions?,Industry trend in investing,Disruptiveness of technology,Number of Direct competitors,Employees per year of company existence,Last round of funding received (in milionUSD),"Survival through recession, based on existence of the company through recession times",Time to 1st investment (in months),"Avg time to investment - average across all rounds, measured from previous investment",Gartner hype cycle stage,Time to maturity of technology (in 
years),Percent_skill_Entrepreneurship,Percent_skill_Operations,Percent_skill_Engineering,Percent_skill_Marketing,Percent_skill_Leadership,Percent_skill_Data Science,Percent_skill_Business Strategy,Percent_skill_Product Management,Percent_skill_Sales,Percent_skill_Domain,Percent_skill_Law,Percent_skill_Consulting,Percent_skill_Finance,Percent_skill_Investment,Renown score
0,Company1,Success,No Info,No Info,-1.0,Video distribution,,operation,KPCB Holdings|Draper Fisher Jurvetson (DFJ)|Kl...,3.0,0.0,No,,5/26/2013,450000.0,United States,North America,2,0,1,2,2,15,Yes,4,Nothing,No,Small,No,No,No,No,Service,No,Private,No,Both,Yes,Platform,Global,Linear,Yes,,No,No,No,No,No,No,No,No,No,Yes,Online,B2C,High,High,Yes,Low,Masters,21,business,Yes,Yes,Tier_1,500,Medium,0,0,0,,0,,0,Low,Low,Yes,No,High,No Info,No,No,Yes,No,No,No,No Info,9626884,No,Yes,No,2.0,Low,0,1.5,0.45,No Info,No Info,11.56,,,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0
1,Company2,Success,2011,3,125.0,,Market Research|Marketing|Crowdfunding,"Marketing, sales",,,,No,,,,United States,North America,5,0,2,0,4,20,No,0,medium,Yes,Large,Yes,Yes,No,No,Product,No,Public,Yes,Both,No,Platform,Local,Non-Linear,No,Few,Yes,No,Yes,Yes,No,No,Yes,Yes,Yes,No,Online,B2C,Low,High,Yes,High,Masters,21,Supply Chain Management & Entrepreneurship,Yes,Yes,Tier_1,500,High,0,0,0,Medium,13,,34,High,Medium,Yes,No,Low,No Info,No,Yes,Yes,No,No,Yes,Medium,1067034,Yes,Yes,No,3.0,Medium,0,6.666666667,5.0,Not Applicable,10,9.0,Trough,2 to 5,15.88235294,11.76470588,15,12.94117647,0,8.823529412,21.76470588,10.88235294,2.941176471,0,0,0,0,0,8


In [None]:
# Columns whose number of distinct values exceeds half the number of rows
distinct_counts = original_data.nunique()
distinct_counts[distinct_counts > len(original_data) / 2]
# Company_Name could be an index or dropped

Company_Name                            472
Internet Activity Score                 260
Short Description of company profile    313
Investors                               319
Last Funding Date                       284
google page rank of company website     311
dtype: int64

# Data Cleanup Process

In [None]:
# Columns removed during cleanup, and the lowercase markers treated as missing values.
# NOTE(review): this list rebinds the name `to_drop`, shadowing the to_drop() helper
# defined earlier in the notebook; the helper cannot be called after this cell runs.
to_drop = ['Investors']
missingIdentifiers = ['n/a', 'nan', 'no info', 'unknown amount']

In [None]:
# 1) Cleanup: strip/lowercase text columns, turn missing-value markers into NaN,
#    convert columns to numeric where possible
# 2) Cleanup: drop useless columns
data = original_data.apply(lambda col: col.str.strip().str.lower() if col.dtype == 'object' else col)
# DataFrame.applymap is deprecated in favour of DataFrame.map; use whichever this
# pandas version provides. np.nan replaces the np.NaN alias removed in NumPy 2.0.
elementwise = data.map if hasattr(pd.DataFrame, 'map') else data.applymap
data = elementwise(lambda val: np.nan if str(val).strip().lower() in missingIdentifiers else val)
data = data.apply(col_to_numeric)
data = data.drop(to_drop, axis=1, errors='ignore')

# Datetime columns: keep only the year (last '/'-separated field)
data['Est. Founding Date'] = data['Est. Founding Date'].apply(extractYearFromDate)
data['Last Funding Date'] = data['Last Funding Date'].apply(extractYearFromDate)

# Special columns: merge values covering less than 1% of the non-null rows
data['Country of company'] = low_freq_combine(data['Country of company'], 0.01, 'other country')
data['Continent of company'] = low_freq_combine(data['Continent of company'], 0.01, 'other continent')

text_columns = ['Short Description of company profile', 'Industry of company',
                'Focus functions of company', 'Specialization of highest education']
# Separators that should become a single space. str.translate cannot apply this
# mapping: it looks up integer code points, so str keys (and the multi-character
# 'and') never match. The regex below does the replacement instead.
splitters = {'&': ' ', ',': ' ', 'and': ' ', '|':' '}
separator_pattern = r'\s*(?:[&,|]|\band\b)\s*'

# Assign the result back; the bare expression used previously was discarded
data['Short Description of company profile'] = data['Short Description of company profile'].str.lower().fillna('')
data['Industry of company'] = data['Industry of company'].str.lower().fillna('').str.replace('|', ' ', regex=False)
data['Focus functions of company'] = data['Focus functions of company'].str.lower().fillna('').str.replace(separator_pattern, ' ', regex=True)
data['Specialization of highest education'] = data['Specialization of highest education'].str.lower().fillna('').str.replace(separator_pattern, ' ', regex=True)

# Concatenate the free-text columns into a single 'Description' field
data['Description'] = data[text_columns].fillna('').apply(' '.join, axis=1)

# Drop the individual text columns now that they are merged into 'Description'
clean_dataset = data.drop(text_columns, axis=1)

In [None]:
# Sanity check: dimensions and first rows of the cleaned frame
print(f'Shape: {clean_dataset.shape}')
clean_dataset.head()

Shape: (472, 112)


Unnamed: 0,Company_Name,Dependent-Company Status,year of founding,Age of company in years,Internet Activity Score,Employee Count,Employees count MoM change,Has the team size grown,Est. Founding Date,Last Funding Date,Last Funding Amount,Country of company,Continent of company,Number of Investors in Seed,Number of Investors in Angel and or VC,Number of Co-founders,Number of of advisors,Team size Senior leadership,Team size all employees,Presence of a top angel or venture fund in previous round of investment,Number of of repeat investors,Number of Sales Support material,Worked in top companies,Average size of companies worked for in the past,Have been part of startups in the past?,Have been part of successful startups in the past?,Was he or she partner in Big 5 consulting?,Consulting experience?,Product or service company?,Catering to product/service across verticals,Focus on private or public data?,Focus on consumer data?,Focus on structured or unstructured data,Subscription based business,Cloud or platform based serive/product?,Local or global player,Linear or Non-linear business model,"Capital intensive business e.g. e-commerce, Engineering products and operations can also cause a business to be capital intensive",Number of of Partners of company,Crowdsourcing based business,Crowdfunding based business,Machine Learning based business,Predictive Analytics business,Speech analytics business,Prescriptive analytics business,Big Data Business,Cross-Channel Analytics/ marketing channels,Owns data or not? (monetization of data) e.g. Factual,Is the company an aggregator/market place? e.g. 
Bluekai,Online or offline venture - physical location based business or online venture?,B2C or B2B venture?,Top forums like 'Tech crunch' or 'Venture beat' talking about the company/model - How much is it being talked about?,Average Years of experience for founder and co founder,Exposure across the globe,Breadth of experience across verticals,Highest education,Years of education,Relevance of education to venture,Relevance of experience to venture,Degree from a Tier 1 or Tier 2 university?,Renowned in professional circle,Experience in selling and building products,Experience in Fortune 100 organizations,Experience in Fortune 500 organizations,Experience in Fortune 1000 organizations,Top management similarity,Number of Recognitions for Founders and Co-founders,Number of of Research publications,Skills score,Team Composition score,Dificulty of Obtaining Work force,Pricing Strategy,Hyper localisation,Time to market service or product,Employee benefits and salary structures,Long term relationship with other founders,Proprietary or patent position (competitive position),Barriers of entry for the competitors,Company awards,Controversial history of founder or co founder,Legal risk and intellectual property,Client Reputation,google page rank of company website,Technical proficiencies to analyse and interpret unstructured data,Solutions offered,Invested through global incubation competitions?,Industry trend in investing,Disruptiveness of technology,Number of Direct competitors,Employees per year of company existence,Last round of funding received (in milionUSD),"Survival through recession, based on existence of the company through recession times",Time to 1st investment (in months),"Avg time to investment - average across all rounds, measured from previous investment",Gartner hype cycle stage,Time to maturity of technology (in 
years),Percent_skill_Entrepreneurship,Percent_skill_Operations,Percent_skill_Engineering,Percent_skill_Marketing,Percent_skill_Leadership,Percent_skill_Data Science,Percent_skill_Business Strategy,Percent_skill_Product Management,Percent_skill_Sales,Percent_skill_Domain,Percent_skill_Law,Percent_skill_Consulting,Percent_skill_Finance,Percent_skill_Investment,Renown score,Description
0,company1,success,,,-1.0,3.0,0.0,no,,2013.0,450000.0,united states,north america,2.0,0.0,1,2,2,15.0,yes,4.0,nothing,no,small,no,no,no,no,service,no,private,no,both,yes,platform,global,linear,yes,none,no,no,no,no,no,no,no,no,no,yes,online,b2c,high,high,yes,low,masters,21.0,yes,yes,tier_1,500.0,medium,0.0,0.0,0.0,none,0.0,none,0.0,low,low,yes,no,high,,no,no,yes,no,no,no,,9626884.0,no,yes,no,2.0,low,0.0,1.5,0.45,,,11.56,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,video distribution operation business
1,company2,success,2011.0,3.0,125.0,,,no,,,,united states,north america,5.0,0.0,2,0,4,20.0,no,0.0,medium,yes,large,yes,yes,no,no,product,no,public,yes,both,no,platform,local,non-linear,no,few,yes,no,yes,yes,no,no,yes,yes,yes,no,online,b2c,low,high,yes,high,masters,21.0,yes,yes,tier_1,500.0,high,0.0,0.0,0.0,medium,13.0,none,34.0,high,medium,yes,no,low,,no,yes,yes,no,no,yes,medium,1067034.0,yes,yes,no,3.0,medium,0.0,6.666667,5.0,not applicable,10.0,9.0,trough,2 to 5,15.882353,11.764706,15.0,12.941176,0.0,8.823529,21.764706,10.882353,2.941176,0.0,0.0,0.0,0.0,0.0,8.0,market research marketing crowdfunding market...
2,company3,success,2011.0,3.0,455.0,14.0,0.0,no,2011.0,2013.0,2350000.0,united states,north america,15.0,0.0,3,0,7,10.0,no,0.0,low,yes,medium,no,no,no,no,both,yes,private,yes,both,yes,cloud,local,non-linear,no,few,no,no,no,yes,no,no,yes,no,no,no,online,b2b,low,medium,yes,low,bachelors,18.0,yes,yes,tier_2,500.0,high,0.0,0.0,1.0,medium,18.0,none,36.0,high,medium,yes,no,low,,yes,yes,yes,no,no,no,low,71391.0,yes,yes,yes,3.0,medium,0.0,3.333333,2.35,not applicable,2.0,7.344444,trough,2 to 5,9.401709,0.0,57.478632,0.0,0.0,3.846154,17.094017,9.401709,0.0,2.777778,0.0,0.0,0.0,0.0,9.0,event data analytics api analytics cloud compu...
3,company4,success,2009.0,5.0,-99.0,45.0,10.0,no,2009.0,2012.0,10250000.0,united states,north america,6.0,0.0,2,0,4,50.0,yes,0.0,low,no,large,yes,yes,no,no,product,yes,public,yes,structured,yes,platform,local,non-linear,no,few,yes,no,no,no,no,no,no,no,no,no,online,b2c,medium,medium,yes,low,bachelors,18.0,yes,yes,tier_2,,low,0.0,0.0,0.0,medium,2.0,none,15.5,medium,medium,yes,no,low,good,no,yes,yes,no,no,no,low,11847.0,no,yes,yes,4.0,medium,2.0,10.0,10.25,not applicable,1.0,8.7,trough,2 to 5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,the most advanced analytics for mobile mobile ...
4,company5,success,2010.0,4.0,496.0,39.0,3.0,no,2010.0,2013.0,5500000.0,united states,north america,7.0,0.0,1,1,8,40.0,no,0.0,high,no,small,no,no,no,no,product,yes,public,yes,both,no,platform,local,non-linear,yes,few,no,no,no,no,no,no,yes,no,no,no,online,b2b,low,high,yes,medium,bachelors,18.0,yes,yes,none,500.0,high,0.0,0.0,0.0,low,5.0,few,23.0,medium,medium,yes,no,low,bad,yes,yes,yes,no,no,no,low,201814.0,yes,yes,no,3.0,medium,0.0,10.0,5.5,not applicable,13.0,9.822222,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,the location-based marketing platform analytic...


In [None]:
# Index rows by company name, then remove rows that repeat an earlier row.
# drop_duplicates compares column values only, so the index is not considered.
dataset = clean_dataset.set_index('Company_Name').drop_duplicates()
display(dataset.head())
print(dataset.shape)

Unnamed: 0_level_0,Dependent-Company Status,year of founding,Age of company in years,Internet Activity Score,Employee Count,Employees count MoM change,Has the team size grown,Est. Founding Date,Last Funding Date,Last Funding Amount,Country of company,Continent of company,Number of Investors in Seed,Number of Investors in Angel and or VC,Number of Co-founders,Number of of advisors,Team size Senior leadership,Team size all employees,Presence of a top angel or venture fund in previous round of investment,Number of of repeat investors,Number of Sales Support material,Worked in top companies,Average size of companies worked for in the past,Have been part of startups in the past?,Have been part of successful startups in the past?,Was he or she partner in Big 5 consulting?,Consulting experience?,Product or service company?,Catering to product/service across verticals,Focus on private or public data?,Focus on consumer data?,Focus on structured or unstructured data,Subscription based business,Cloud or platform based serive/product?,Local or global player,Linear or Non-linear business model,"Capital intensive business e.g. e-commerce, Engineering products and operations can also cause a business to be capital intensive",Number of of Partners of company,Crowdsourcing based business,Crowdfunding based business,Machine Learning based business,Predictive Analytics business,Speech analytics business,Prescriptive analytics business,Big Data Business,Cross-Channel Analytics/ marketing channels,Owns data or not? (monetization of data) e.g. Factual,Is the company an aggregator/market place? e.g. 
Bluekai,Online or offline venture - physical location based business or online venture?,B2C or B2B venture?,Top forums like 'Tech crunch' or 'Venture beat' talking about the company/model - How much is it being talked about?,Average Years of experience for founder and co founder,Exposure across the globe,Breadth of experience across verticals,Highest education,Years of education,Relevance of education to venture,Relevance of experience to venture,Degree from a Tier 1 or Tier 2 university?,Renowned in professional circle,Experience in selling and building products,Experience in Fortune 100 organizations,Experience in Fortune 500 organizations,Experience in Fortune 1000 organizations,Top management similarity,Number of Recognitions for Founders and Co-founders,Number of of Research publications,Skills score,Team Composition score,Dificulty of Obtaining Work force,Pricing Strategy,Hyper localisation,Time to market service or product,Employee benefits and salary structures,Long term relationship with other founders,Proprietary or patent position (competitive position),Barriers of entry for the competitors,Company awards,Controversial history of founder or co founder,Legal risk and intellectual property,Client Reputation,google page rank of company website,Technical proficiencies to analyse and interpret unstructured data,Solutions offered,Invested through global incubation competitions?,Industry trend in investing,Disruptiveness of technology,Number of Direct competitors,Employees per year of company existence,Last round of funding received (in milionUSD),"Survival through recession, based on existence of the company through recession times",Time to 1st investment (in months),"Avg time to investment - average across all rounds, measured from previous investment",Gartner hype cycle stage,Time to maturity of technology (in 
years),Percent_skill_Entrepreneurship,Percent_skill_Operations,Percent_skill_Engineering,Percent_skill_Marketing,Percent_skill_Leadership,Percent_skill_Data Science,Percent_skill_Business Strategy,Percent_skill_Product Management,Percent_skill_Sales,Percent_skill_Domain,Percent_skill_Law,Percent_skill_Consulting,Percent_skill_Finance,Percent_skill_Investment,Renown score,Description
Company_Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1
company1,success,,,-1.0,3.0,0.0,no,,2013.0,450000.0,united states,north america,2.0,0.0,1,2,2,15.0,yes,4.0,nothing,no,small,no,no,no,no,service,no,private,no,both,yes,platform,global,linear,yes,none,no,no,no,no,no,no,no,no,no,yes,online,b2c,high,high,yes,low,masters,21.0,yes,yes,tier_1,500.0,medium,0.0,0.0,0.0,none,0.0,none,0.0,low,low,yes,no,high,,no,no,yes,no,no,no,,9626884.0,no,yes,no,2.0,low,0.0,1.5,0.45,,,11.56,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,video distribution operation business
company2,success,2011.0,3.0,125.0,,,no,,,,united states,north america,5.0,0.0,2,0,4,20.0,no,0.0,medium,yes,large,yes,yes,no,no,product,no,public,yes,both,no,platform,local,non-linear,no,few,yes,no,yes,yes,no,no,yes,yes,yes,no,online,b2c,low,high,yes,high,masters,21.0,yes,yes,tier_1,500.0,high,0.0,0.0,0.0,medium,13.0,none,34.0,high,medium,yes,no,low,,no,yes,yes,no,no,yes,medium,1067034.0,yes,yes,no,3.0,medium,0.0,6.666667,5.0,not applicable,10.0,9.0,trough,2 to 5,15.882353,11.764706,15.0,12.941176,0.0,8.823529,21.764706,10.882353,2.941176,0.0,0.0,0.0,0.0,0.0,8.0,market research marketing crowdfunding market...
company3,success,2011.0,3.0,455.0,14.0,0.0,no,2011.0,2013.0,2350000.0,united states,north america,15.0,0.0,3,0,7,10.0,no,0.0,low,yes,medium,no,no,no,no,both,yes,private,yes,both,yes,cloud,local,non-linear,no,few,no,no,no,yes,no,no,yes,no,no,no,online,b2b,low,medium,yes,low,bachelors,18.0,yes,yes,tier_2,500.0,high,0.0,0.0,1.0,medium,18.0,none,36.0,high,medium,yes,no,low,,yes,yes,yes,no,no,no,low,71391.0,yes,yes,yes,3.0,medium,0.0,3.333333,2.35,not applicable,2.0,7.344444,trough,2 to 5,9.401709,0.0,57.478632,0.0,0.0,3.846154,17.094017,9.401709,0.0,2.777778,0.0,0.0,0.0,0.0,9.0,event data analytics api analytics cloud compu...
company4,success,2009.0,5.0,-99.0,45.0,10.0,no,2009.0,2012.0,10250000.0,united states,north america,6.0,0.0,2,0,4,50.0,yes,0.0,low,no,large,yes,yes,no,no,product,yes,public,yes,structured,yes,platform,local,non-linear,no,few,yes,no,no,no,no,no,no,no,no,no,online,b2c,medium,medium,yes,low,bachelors,18.0,yes,yes,tier_2,,low,0.0,0.0,0.0,medium,2.0,none,15.5,medium,medium,yes,no,low,good,no,yes,yes,no,no,no,low,11847.0,no,yes,yes,4.0,medium,2.0,10.0,10.25,not applicable,1.0,8.7,trough,2 to 5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,the most advanced analytics for mobile mobile ...
company5,success,2010.0,4.0,496.0,39.0,3.0,no,2010.0,2013.0,5500000.0,united states,north america,7.0,0.0,1,1,8,40.0,no,0.0,high,no,small,no,no,no,no,product,yes,public,yes,both,no,platform,local,non-linear,yes,few,no,no,no,no,no,no,yes,no,no,no,online,b2b,low,high,yes,medium,bachelors,18.0,yes,yes,none,500.0,high,0.0,0.0,0.0,low,5.0,few,23.0,medium,medium,yes,no,low,bad,yes,yes,yes,no,no,no,low,201814.0,yes,yes,no,3.0,medium,0.0,10.0,5.5,not applicable,13.0,9.822222,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,the location-based marketing platform analytic...


(470, 111)


In [None]:
# Write the cleaned dataset (with its index) to the same Drive folder as the source CSV.
dataset.to_csv(gDrivePath+'SS_dataset.csv')