In [2]:
import pyodbc
from dotenv import dotenv_values
import pandas as pd
import warnings
import numpy as np
import calendar
import matplotlib.pyplot as plt
import seaborn as sns
import requests

# Suppress every warning for the rest of the session
warnings.filterwarnings('ignore')

# Let Pandas render up to 100 rows and 100 columns before truncating output
for display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(display_option, 100)

#### Data Collection
##### Access the 2020 and 2021 data from Microsoft SQL Server
##### Connect to the database using provided credentials

In [3]:
# Load the key/value pairs of the .env file into a dictionary
environment_variables = dotenv_values('.env')

# Database settings, looked up by the key names used in the .env file
server = environment_variables.get("DBS")
login = environment_variables.get("DBU")
password = environment_variables.get("DBP")
database = environment_variables.get("DBN")

# Fail fast when a setting is absent: otherwise the literal text "None" would be
# interpolated into the connection string and only surface as an opaque driver error
missing_settings = [
    key for key in ("DBS", "DBU", "DBP", "DBN")
    if environment_variables.get(key) is None
]
if missing_settings:
    raise ValueError(f"Missing database settings in .env: {', '.join(missing_settings)}")

# Build the ODBC connection string consumed by pyodbc
connection_string = f"DRIVER={{SQL Server}};SERVER={server};DATABASE={database};UID={login};PWD={password}"

In [4]:
# Open a pyodbc connection to the database described by `connection_string`;
# the resulting `connection` object is what the later pd.read_sql calls use
connection = pyodbc.connect(connection_string)

##### Schema for tables in the database

In [5]:
# SQL query listing the tables that live in the 'dbo' schema of the database
db_query = """
        SELECT *
        FROM INFORMATION_SCHEMA.TABLES
        WHERE TABLE_SCHEMA = 'dbo'
        """

# Run the query once, inside the try block, so that a failed connection or query
# is actually reported by the handler below. The result is the table listing.
try:
    schema_df = pd.read_sql(db_query, connection)
    print("Data retrieved successfully")
    print(schema_df)
except Exception as e:
    print("Failed to retrieve data:", e)
    # Re-raise so the notebook stops here instead of continuing without schema_df
    raise

Data retrieved successfully
  TABLE_CATALOG TABLE_SCHEMA               TABLE_NAME  TABLE_TYPE
0         dapDB          dbo  LP1_startup_funding2021  BASE TABLE
1         dapDB          dbo  LP1_startup_funding2020  BASE TABLE


##### LP1_startup_funding2021 Data

In [6]:
# SQL text selecting every row and column of the 2021 funding table
query_2021 = """
        SELECT *
        FROM LP1_startup_funding2021        
        """

# Execute the query over the open connection and load the result into a dataframe
df_2021 = pd.read_sql(sql=query_2021, con=connection)

# Column dtypes and non-null counts, followed by a blank separator line
df_2021.info()
print()

# Preview the first two rows
df_2021.head(n=2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1209 entries, 0 to 1208
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Company_Brand  1209 non-null   object 
 1   Founded        1208 non-null   float64
 2   HeadQuarter    1208 non-null   object 
 3   Sector         1209 non-null   object 
 4   What_it_does   1209 non-null   object 
 5   Founders       1205 non-null   object 
 6   Investor       1147 non-null   object 
 7   Amount         1206 non-null   object 
 8   Stage          781 non-null    object 
dtypes: float64(1), object(8)
memory usage: 85.1+ KB



Unnamed: 0,Company_Brand,Founded,HeadQuarter,Sector,What_it_does,Founders,Investor,Amount,Stage
0,Unbox Robotics,2019.0,Bangalore,AI startup,Unbox Robotics builds on-demand AI-driven ware...,"Pramod Ghadge, Shahid Memon","BEENEXT, Entrepreneur First","$1,200,000",Pre-series A
1,upGrad,2015.0,Mumbai,EdTech,UpGrad is an online higher education platform.,"Mayank Kumar, Phalgun Kompalli, Ravijot Chugh,...","Unilazer Ventures, IIFL Asset Management","$120,000,000",


##### LP1_startup_funding2020 Data

In [7]:
# SQL text selecting every row and column of the 2020 funding table
query_2020 = """
        SELECT *
        FROM LP1_startup_funding2020        
        """

# Execute the query over the open connection and load the result into a dataframe
df_2020 = pd.read_sql(sql=query_2020, con=connection)

# Column dtypes and non-null counts, followed by a blank separator line
df_2020.info()
print()

# Preview the first two rows
df_2020.head(n=2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1055 entries, 0 to 1054
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Company_Brand  1055 non-null   object 
 1   Founded        842 non-null    float64
 2   HeadQuarter    961 non-null    object 
 3   Sector         1042 non-null   object 
 4   What_it_does   1055 non-null   object 
 5   Founders       1043 non-null   object 
 6   Investor       1017 non-null   object 
 7   Amount         801 non-null    float64
 8   Stage          591 non-null    object 
 9   column10       2 non-null      object 
dtypes: float64(2), object(8)
memory usage: 82.6+ KB



Unnamed: 0,Company_Brand,Founded,HeadQuarter,Sector,What_it_does,Founders,Investor,Amount,Stage,column10
0,Aqgromalin,2019.0,Chennai,AgriTech,Cultivating Ideas for Profit,"Prasanna Manogaran, Bharani C L",Angel investors,200000.0,,
1,Krayonnz,2019.0,Bangalore,EdTech,An academy-guardian-scholar centric ecosystem ...,"Saurabh Dixit, Gurudutt Upadhyay",GSF Accelerator,100000.0,Pre-seed,


#### 2019 Data from OneDrive

In [8]:
# Folder holding the downloaded LP1 CSV files. It defaults to the location this
# notebook was written against; add a DATA_DIR entry to .env to point it
# elsewhere without editing the notebook.
data_dir = environment_variables.get("DATA_DIR") or "C:/Users/wolak/Documents/LP1 Datasets"

# Path of the 2019 CSV file inside that folder
file_path = f"{data_dir}/startup_funding2019.csv"

# Read the CSV file into a dataframe
df_2019 = pd.read_csv(file_path)

# Display dataframe info, followed by a blank separator line
df_2019.info()
print()

# Preview the first two rows
df_2019.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89 entries, 0 to 88
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Company/Brand  89 non-null     object 
 1   Founded        60 non-null     float64
 2   HeadQuarter    70 non-null     object 
 3   Sector         84 non-null     object 
 4   What it does   89 non-null     object 
 5   Founders       86 non-null     object 
 6   Investor       89 non-null     object 
 7   Amount($)      89 non-null     object 
 8   Stage          43 non-null     object 
dtypes: float64(1), object(8)
memory usage: 6.4+ KB



Unnamed: 0,Company/Brand,Founded,HeadQuarter,Sector,What it does,Founders,Investor,Amount($),Stage
0,Bombay Shaving,,,Ecommerce,Provides a range of male grooming products,Shantanu Deshpande,Sixth Sense Ventures,"$6,300,000",
1,Ruangguru,2014.0,Mumbai,Edtech,A learning platform that provides topic-based ...,"Adamas Belva Syah Devara, Iman Usman.",General Atlantic,"$150,000,000",Series C


#### 2018 Data from GitHub repository

In [9]:
# URL of the 2018 CSV file in the GitHub repository
url = "https://raw.githubusercontent.com/Azubi-Africa/Career_Accelerator_LP1-Data_Analysis/main/startup_funding2018.csv"

# Folder for the local copy. It defaults to the location this notebook was written
# against; add a DATA_DIR entry to .env to point it elsewhere.
data_dir = environment_variables.get("DATA_DIR") or r"C:\Users\wolak\Documents\LP1 Datasets"
local_path = f"{data_dir}/startup_funding2018.csv"

# Download the file and store it locally. This step is best-effort: a network
# failure is reported but not fatal, so a copy already on disk can still be read.
try:
    # A timeout keeps the cell from hanging indefinitely on an unresponsive server
    response = requests.get(url, timeout=30)
    response.raise_for_status()  # Raise for 4xx/5xx responses
    with open(local_path, "wb") as f:
        f.write(response.content)
    print("File saved successfully")
except requests.exceptions.RequestException as e:
    print(f"Failed to download file: {e}")

# Read the local CSV into a dataframe. Errors are reported and then re-raised,
# because the statements below cannot run without df_2018.
try:
    df_2018 = pd.read_csv(local_path)
    print("File read successfully")
except pd.errors.ParserError as e:
    print(f"Parser error: {e}")
    raise
except Exception as e:
    print(f"An error occurred: {e}")
    raise

# Display dataframe info, followed by a blank separator line
df_2018.info()
print()
# Preview the first two rows
df_2018.head(2)

File saved successfully
File read successfully
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 526 entries, 0 to 525
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Company Name   526 non-null    object
 1   Industry       526 non-null    object
 2   Round/Series   526 non-null    object
 3   Amount         526 non-null    object
 4   Location       526 non-null    object
 5   About Company  526 non-null    object
dtypes: object(6)
memory usage: 24.8+ KB



Unnamed: 0,Company Name,Industry,Round/Series,Amount,Location,About Company
0,TheCollegeFever,"Brand Marketing, Event Promotion, Marketing, S...",Seed,250000,"Bangalore, Karnataka, India","TheCollegeFever is a hub for fun, fiesta and f..."
1,Happy Cow Dairy,"Agriculture, Farming",Seed,"₹40,000,000","Mumbai, Maharashtra, India",A startup which aggregates milk from dairy far...


#### Brief Data Standardization Considerations

In [10]:
# Show the column names of each yearly dataframe, newest first; every heading
# except the first is preceded by a blank line
frames_by_year = {2021: df_2021, 2020: df_2020, 2019: df_2019, 2018: df_2018}
for position, (year, frame) in enumerate(frames_by_year.items()):
    heading = f"Column names of {year} dataframe:"
    print(heading if position == 0 else "\n" + heading)
    print(frame.columns)

Column names of 2021 dataframe:
Index(['Company_Brand', 'Founded', 'HeadQuarter', 'Sector', 'What_it_does',
       'Founders', 'Investor', 'Amount', 'Stage'],
      dtype='object')

Column names of 2020 dataframe:
Index(['Company_Brand', 'Founded', 'HeadQuarter', 'Sector', 'What_it_does',
       'Founders', 'Investor', 'Amount', 'Stage', 'column10'],
      dtype='object')

Column names of 2019 dataframe:
Index(['Company/Brand', 'Founded', 'HeadQuarter', 'Sector', 'What it does',
       'Founders', 'Investor', 'Amount($)', 'Stage'],
      dtype='object')

Column names of 2018 dataframe:
Index(['Company Name', 'Industry', 'Round/Series', 'Amount', 'Location',
       'About Company'],
      dtype='object')


In [11]:
# Build one lookup that maps every source column name onto a shared schema

# Names that are kept unchanged (they map to themselves)
canonical_columns = [
    'Company_Brand', 'Founded', 'HeadQuarter', 'Sector', 'What_it_does',
    'Founders', 'Investor', 'Amount', 'Stage',
]
column_mapping = {name: name for name in canonical_columns}

# Names that need translating
column_mapping.update({
    # extra column present only in the 2020 table
    'column10': 'Column 10',
    # 2019 CSV headers
    'Company/Brand': 'Company_Brand',
    'What it does': 'What_it_does',
    'Amount($)': 'Amount',
    # 2018 CSV headers
    'Company Name': 'Company_Brand',
    'Industry': 'Sector',
    'Round/Series': 'Stage',
    'Location': 'HeadQuarter',
    'About Company': 'What_it_does',
})

In [12]:
# Apply the shared mapping to each yearly dataframe in place, so the existing
# df_20xx names keep referring to the renamed frames
frames_by_year = {2021: df_2021, 2020: df_2020, 2019: df_2019, 2018: df_2018}
for frame in frames_by_year.values():
    frame.rename(columns=column_mapping, inplace=True)

# Show the column names after renaming; every heading except the first is
# preceded by a blank line
for position, (year, frame) in enumerate(frames_by_year.items()):
    heading = f"Column names of {year} dataframe:"
    print(heading if position == 0 else "\n" + heading)
    print(frame.columns)

Column names of 2021 dataframe:
Index(['Company_Brand', 'Founded', 'HeadQuarter', 'Sector', 'What_it_does',
       'Founders', 'Investor', 'Amount', 'Stage'],
      dtype='object')

Column names of 2020 dataframe:
Index(['Company_Brand', 'Founded', 'HeadQuarter', 'Sector', 'What_it_does',
       'Founders', 'Investor', 'Amount', 'Stage', 'Column 10'],
      dtype='object')

Column names of 2019 dataframe:
Index(['Company_Brand', 'Founded', 'HeadQuarter', 'Sector', 'What_it_does',
       'Founders', 'Investor', 'Amount', 'Stage'],
      dtype='object')

Column names of 2018 dataframe:
Index(['Company_Brand', 'Sector', 'Stage', 'Amount', 'HeadQuarter',
       'What_it_does'],
      dtype='object')


#### Brief EDA on all dataframes

##### 2018 Dataframe

In [13]:
# Summary statistics of the 2018 dataframe, transposed so each column is a row
df_2018.describe().transpose()

Unnamed: 0,count,unique,top,freq
Company_Brand,526,525,TheCollegeFever,2
Sector,526,405,—,30
Stage,526,21,Seed,280
Amount,526,198,—,148
HeadQuarter,526,50,"Bangalore, Karnataka, India",102
What_it_does,526,524,"TheCollegeFever is a hub for fun, fiesta and f...",2


In [14]:
# Report the dataframe dimensions: the raw shape tuple first, then in words
row_count, column_count = df_2018.shape
print(df_2018.shape)
print(f"There are {row_count} rows, and {column_count} columns")

(526, 6)
There are 526 rows, and 6 columns


In [15]:
# Print a summary of the 2018 dataframe: index range, column names,
# non-null counts, dtypes and memory usage

df_2018.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 526 entries, 0 to 525
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Company_Brand  526 non-null    object
 1   Sector         526 non-null    object
 2   Stage          526 non-null    object
 3   Amount         526 non-null    object
 4   HeadQuarter    526 non-null    object
 5   What_it_does   526 non-null    object
dtypes: object(6)
memory usage: 24.8+ KB


In [16]:
# Count the missing values per column and print them under a heading,
# with a blank line before and after the heading and after the counts
missing_values = df_2018.isna().sum()
print("", "Missing values in each column:", "", sep="\n")
print(missing_values, end="\n\n")

# Count the rows that exactly repeat an earlier row, then print a trailing blank line
duplicate_count = df_2018.duplicated().sum()
print("Number of duplicated rows:", duplicate_count, end="\n\n")


Missing values in each column:

Company_Brand    0
Sector           0
Stage            0
Amount           0
HeadQuarter      0
What_it_does     0
dtype: int64

Number of duplicated rows: 1



In [17]:
# Fraction of missing values in each column (0.0 means nothing is missing)
df_2018.isna().mean()

Company_Brand    0.0
Sector           0.0
Stage            0.0
Amount           0.0
HeadQuarter      0.0
What_it_does     0.0
dtype: float64

Preliminary observations on the 2018 dataset

The most frequent Stage was Seed

Most Headquarters were in Bangalore

#### 2019 Dataframe