In [45]:
from pyspark.sql.functions import col, regexp_replace, when,count,sum,round,concat_ws,lit,format_number,explode,split
from pyspark.sql.session import SparkSession
import pandas as pd
import openpyxl

In [46]:
# Build (or reuse) the Spark session for this notebook, running in local mode.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Shark Tank Analysis")
    .getOrCreate()
)

In [47]:
# Read the Shark Tank India CSV: the first row supplies the column names and
# Spark infers each column's type from the data.
shark_tank_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/Users/sahilnagpal/Desktop/byte-building/SharkTankIndia/SharkTankIndia.csv")
)

In [48]:
# Normalise the column names: trim surrounding whitespace first, then replace
# the remaining inner spaces with underscores. The order matters: stripping
# after the replacement is a no-op, because leading/trailing spaces would
# already have been turned into underscores.

new_column_names = [c.strip().replace(" ", "_") for c in shark_tank_df.columns]
shark_tank_df = shark_tank_df.toDF(*new_column_names)

In [49]:
# Objective1: Analyze the number of startups, deals made, and total investment per season.

# Step 1: one row per season with the startup count and the rounded sums of
# deal amount and deal equity.
season_totals = shark_tank_df.groupBy("Season_Number").agg(
    count("Startup_Name").alias("total_start_ups"),
    round(sum("Total_Deal_Amount").cast("double"), 2).alias("sum_of_total_deals"),
    round(sum("Total_Deal_Equity").cast("double"), 2).alias("sum_of_total_equity"),
)

# Step 2: swap the numeric season for a readable "Season <n>" label, placed last.
obj1_df = season_totals.select(
    "total_start_ups",
    "sum_of_total_deals",
    "sum_of_total_equity",
    concat_ws(" ", lit("Season"), col("Season_Number")).alias("SeasonNumber"),
)
obj1_pd_df = obj1_df.toPandas()


In [50]:
# Objective2: Identify which industries received the most investments and deals.

# col, count, sum and round are already imported in the notebook's import
# cell, so they are not re-imported here.

obj2_df = (
    shark_tank_df
    # Keep only pitches that received an offer and accepted it. The filter runs
    # before the projection so it does not depend on Spark re-resolving columns
    # that the select has already dropped.
    .filter((col("Received_Offer") == 1) & (col("Accepted_Offer") == 1))
    .select("Season_Number", "Industry", "Startup_Name", "Total_Deal_Amount",
            "Original_Ask_Amount", "Original_Offered_Equity")
    # Implied valuation = ask amount / offered equity fraction.
    # NOTE(review): the * 100000 presumably converts lakhs to rupees - confirm
    # against the dataset's documentation.
    .withColumn("Valuation_INR",
                (col("Original_Ask_Amount") * 100000) / (col("Original_Offered_Equity") / 100))
    .groupBy("Industry", "Season_Number")
    .agg(
        count("Startup_Name").alias("count_of_start_ups"),
        round(sum("Total_Deal_Amount").cast("double"), 2).alias("sum_of_total_deals"),
        round(sum(col("Valuation_INR").cast("double")), 2).alias("sum_of_total_valuation"),
    )
    # format_number returns a string with thousands separators, so this column
    # is exported as text rather than as a number.
    .withColumn("sum_of_total_valuation", format_number(col("sum_of_total_valuation"), 0))
    .withColumn("SeasonNumber", concat_ws(" ", lit("Season"), col("Season_Number")))
    .drop("Season_Number")
)


obj2_pd_df = obj2_df.toPandas()

In [51]:
# Objective4: Identify which cities and states have the most startups pitching.
# NOTE: only the state-level breakdown is computed here; cities are not aggregated.

obj3_df = (
    shark_tank_df
    .select("Startup_Name", "Pitchers_State", "Season_Number")
    # Pitchers_State can list several comma-separated states. Trim the whole
    # value and split on the comma together with any surrounding whitespace,
    # so that " Goa" and "Goa" are not counted as two different states.
    # Rows whose state is null produce no output rows from explode().
    .withColumn(
        "State",
        explode(split(regexp_replace(col("Pitchers_State"), r"^\s+|\s+$", ""), r"\s*,\s*")),
    )
    .groupby("State", "Season_Number")
    .agg(count("Startup_Name").alias("cnt_of_starts_up"))
    .withColumn("SeasonNumber", concat_ws(" ", lit("Season"), col("Season_Number")))
    .drop("Season_Number")
)


obj3_pd_df = obj3_df.toPandas()

In [52]:
# Print the schema of the renamed DataFrame to check column names and inferred types.
shark_tank_df.printSchema()

root
 |-- Season_Number: integer (nullable = true)
 |-- Startup_Name: string (nullable = true)
 |-- Episode_Number: integer (nullable = true)
 |-- Pitch_Number: integer (nullable = true)
 |-- Season_Start: string (nullable = true)
 |-- Season_End: string (nullable = true)
 |-- Original_Air_Date: string (nullable = true)
 |-- Episode_Title: string (nullable = true)
 |-- Anchor: string (nullable = true)
 |-- Industry: string (nullable = true)
 |-- Business_Description: string (nullable = true)
 |-- Company_Website: string (nullable = true)
 |-- Started_in: integer (nullable = true)
 |-- Number_of_Presenters: integer (nullable = true)
 |-- Male_Presenters: integer (nullable = true)
 |-- Female_Presenters: integer (nullable = true)
 |-- Transgender_Presenters: integer (nullable = true)
 |-- Couple_Presenters: integer (nullable = true)
 |-- Pitchers_Average_Age: string (nullable = true)
 |-- Pitchers_City: string (nullable = true)
 |-- Pitchers_State: string (nullable = true)
 |-- Yearly_Re

In [53]:
# Objective5: Year Wise Start Up (Last 3 years)
current_year = 2023
# The three most recent founding years, newest first.
last_3_years = list(range(current_year, current_year - 3, -1))

# Restrict to startups founded in those years and attach the "Season <n>" label.
recent_startups = (
    shark_tank_df
    .filter(col("Started_In").isin(last_3_years))
    .select(
        "Started_In",
        "Industry",
        concat_ws(" ", lit("Season"), col("Season_Number")).alias("SeasonNumber"),
    )
)

# Number of startups per founding year, industry and season (left as the
# cell's last expression so the resulting DataFrame is displayed).
recent_startups.groupby("Started_In", "Industry", "SeasonNumber").agg(count("Started_In").alias("cnt"))




DataFrame[Started_In: int, Industry: string, SeasonNumber: string, cnt: bigint]

In [54]:
# Presenters by gender, per industry and season.
# Male_Presenters / Female_Presenters are integer columns (presumably the
# number of presenters of that gender in each pitch - confirm against the
# dataset). They are summed to get entrepreneur totals; count() would only
# count the non-null rows, i.e. pitches rather than people.
obj4_df = (
    shark_tank_df
    .groupby("Industry", "Season_Number")
    .agg(
        sum("Male_Presenters").alias("Male Entrepreneur"),
        sum("Female_Presenters").alias("Female Entrepreneur"),
    )
    # A group whose values are all null sums to null; report it as 0 instead.
    .fillna(0, subset=["Male Entrepreneur", "Female Entrepreneur"])
)

obj4_pd_df = obj4_df.toPandas()

In [55]:
# Write the first four result tables to the results workbook, one worksheet each.
results_path = "/Users/sahilnagpal/Desktop/byte-building/SharkTankIndia/results/results.xlsx"
sheets = {
    "startups_deal_made_investment": obj1_pd_df,
    "industry_wise": obj2_pd_df,
    "state_analysis": obj3_pd_df,
    "gender": obj4_pd_df,
}
with pd.ExcelWriter(results_path, engine="openpyxl") as writer:
    for sheet_name, frame in sheets.items():
        frame.to_excel(writer, sheet_name=sheet_name, index=False)

In [56]:
# Namita Thapar: total amount invested per season and industry.
amount_col = "total_amount_spent (In Lakhs)"

namita_totals = shark_tank_df.groupby("Season_Number", "Industry").agg(
    sum("Namita_Investment_Amount").alias(amount_col)
)

# Treat missing totals as 0, round to 2 decimals, label the season and the
# shark, then keep only the rows with a positive total.
namita_df = (
    namita_totals
    .select(
        "Industry",
        round(when(col(amount_col).isNull(), 0).otherwise(col(amount_col)), 2).alias(amount_col),
        concat_ws(" ", lit("Season"), col("Season_Number")).alias("SeasonNumber"),
        lit("Namita Thapar").alias("SharkName"),
    )
    .filter(col(amount_col) > 0)
)

In [57]:
# Vineeta Singh: total amount invested per season and industry.
amount_col = "total_amount_spent (In Lakhs)"

vineeta_totals = shark_tank_df.groupby("Season_Number", "Industry").agg(
    sum("Vineeta_Investment_Amount").alias(amount_col)
)

# Treat missing totals as 0, round to 2 decimals, label the season and the
# shark, then keep only the rows with a positive total.
vineeta_df = (
    vineeta_totals
    .select(
        "Industry",
        round(when(col(amount_col).isNull(), 0).otherwise(col(amount_col)), 2).alias(amount_col),
        concat_ws(" ", lit("Season"), col("Season_Number")).alias("SeasonNumber"),
        lit("Vineeta Singh").alias("SharkName"),
    )
    .filter(col(amount_col) > 0)
)

In [58]:
# Anupam Mittal: total amount invested per season and industry.
amount_col = "total_amount_spent (In Lakhs)"

anupam_totals = shark_tank_df.groupby("Season_Number", "Industry").agg(
    sum("Anupam_Investment_Amount").alias(amount_col)
)

# Treat missing totals as 0, round to 2 decimals, label the season and the
# shark, then keep only the rows with a positive total.
anupam_df = (
    anupam_totals
    .select(
        "Industry",
        round(when(col(amount_col).isNull(), 0).otherwise(col(amount_col)), 2).alias(amount_col),
        concat_ws(" ", lit("Season"), col("Season_Number")).alias("SeasonNumber"),
        lit("Anupam Mittal").alias("SharkName"),
    )
    .filter(col(amount_col) > 0)
)

In [59]:
# Aman Gupta: total amount invested per season and industry.
amount_col = "total_amount_spent (In Lakhs)"

aman_totals = shark_tank_df.groupby("Season_Number", "Industry").agg(
    sum("Aman_Investment_Amount").alias(amount_col)
)

# Treat missing totals as 0, round to 2 decimals, label the season and the
# shark, then keep only the rows with a positive total.
aman_df = (
    aman_totals
    .select(
        "Industry",
        round(when(col(amount_col).isNull(), 0).otherwise(col(amount_col)), 2).alias(amount_col),
        concat_ws(" ", lit("Season"), col("Season_Number")).alias("SeasonNumber"),
        lit("Aman Gupta").alias("SharkName"),
    )
    .filter(col(amount_col) > 0)
)

In [60]:
# Peyush Bansal: total amount invested per season and industry.
# The SharkName label now uses the same "Peyush" spelling as the source column
# (Peyush_Investment_Amount); it was previously written "Peeyush".
peyush_df = shark_tank_df\
    .groupby(col("Season_Number"),col("Industry"))\
    .agg(sum("Peyush_Investment_Amount").alias("total_amount_spent (In Lakhs)"))\
    .withColumn("SeasonNumber", concat_ws(" ", lit("Season"), col("Season_Number")))\
    .withColumn("SharkName",lit("Peyush Bansal"))\
    .withColumn("total_amount_spent (In Lakhs)",round(when(col("total_amount_spent (In Lakhs)").isNull(),0).otherwise(col("total_amount_spent (In Lakhs)")),2))\
    .filter(col("total_amount_spent (In Lakhs)")>0)\
    .drop("Season_Number")

In [61]:
# Stack the per-shark tables into one long table.
# unionByName matches columns by name instead of by position, so the result
# stays correct even if one of the inputs orders its columns differently; it
# also replaces the deprecated unionAll. The chain is parenthesised so it no
# longer ends in a dangling line-continuation backslash.
obj5_df = (
    vineeta_df
    .unionByName(namita_df)
    .unionByName(anupam_df)
    .unionByName(aman_df)
    .unionByName(peyush_df)
)

obj5_pd_df = obj5_df.toPandas()

In [62]:
# Write all five result tables to the results workbook, one worksheet each.
results_path = "/Users/sahilnagpal/Desktop/byte-building/SharkTankIndia/results/results.xlsx"
sheets = {
    "startups_deal_made_investment": obj1_pd_df,
    "industry_wise": obj2_pd_df,
    "state_analysis": obj3_pd_df,
    "gender": obj4_pd_df,
    "sharks": obj5_pd_df,
}
with pd.ExcelWriter(results_path, engine="openpyxl") as writer:
    for sheet_name, frame in sheets.items():
        frame.to_excel(writer, sheet_name=sheet_name, index=False)