In [2]:
# Let's first load the dataset and inspect it to understand the structure.
import pandas as pd

# Load the dataset
file_path = (r'C:\Users\TUF GAMING\Documents\main_dataset.csv')
data = pd.read_csv(file_path)

# Display the first few rows to understand the structure of the dataset
data.head()

Unnamed: 0,STARTUP NAME,INDUSTRY,FOUNDING YEAR,UNICORN ENTRY YEAR,PROFIT/LOSS FY22,CURRENT VALUATION,ACQUISITIONS,STATUS
0,Perfios,SaaS,2008,2024,$0.94 Million,$1 Billion,3.0,Private
1,Zepto,E-commerce,2021,2023,-$47.1 Million,$1.4 Billion,,Private
2,Molbio Diagnostics,HealthTech,2010,2022,,$1.5 Billion,1.0,Private
3,Tata 1mg,HealthTech,2015,2022,-$65 Million,$1.25 Billion,,Acquired
4,Shiprocket,Logistics,2017,2022,-$11.67 Million,$1.3 Billion,5.0,Private


In [3]:
# Preprocess the data

# 1. Clean the 'CURRENT VALUATION' column to remove symbols and convert it to numeric
data['CURRENT VALUATION'] = data['CURRENT VALUATION'].replace({'\$': '', ' Billion': '', ' Million': ''}, regex=True)
data['CURRENT VALUATION'] = pd.to_numeric(data['CURRENT VALUATION'], errors='coerce')

# 2. Create the 'SUCCESS' column, where success is defined as valuation > $1 billion
data['SUCCESS'] = (data['CURRENT VALUATION'] > 1).astype(int)

# 3. Fill missing values in 'ACQUISITIONS' with 0
data['ACQUISITIONS'].fillna(0, inplace=True)

# 4. Normalize industries with higher survival rates by giving them a slight boost in success rate
# Industries with higher survival rate: E-Commerce, Foodtech, IT and Gaming, and Fintech
boosted_sectors = ['E-commerce', 'Foodtech', 'IT and Gaming', 'Fintech']
data['INDUSTRY_SUCCESS_BOOST'] = data['INDUSTRY'].apply(lambda x: 1.2 if x in boosted_sectors else 1)

# Define features and the target variable
X = data[['FOUNDING YEAR', 'UNICORN ENTRY YEAR', 'ACQUISITIONS', 'INDUSTRY_SUCCESS_BOOST']]
y = data['SUCCESS']

# Import the RandomForestClassifier and train the model
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train the RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
model.fit(X_train, y_train)

# Make predictions on the test set and calculate accuracy
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

accuracy_percentage = accuracy * 100
accuracy_percentage

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['ACQUISITIONS'].fillna(0, inplace=True)


73.33333333333333

In [12]:
# Define a class for predicting the success and failure rates of startups
class StartupSurvivalPredictor:
    def __init__(self, model, valid_sectors, boosted_sectors):
        self.model = model
        self.valid_sectors = valid_sectors
        self.boosted_sectors = boosted_sectors

    def predict_survival_rate(self, company_name, sector, founding_year, entry_year, valuation):
        # Normalize input sector
        sector_normalized = sector.strip()

        # Check if sector is recognized
        if sector_normalized not in self.valid_sectors:
            print("Sector not recognized. Please choose from the following:")
            print(self.valid_sectors)
            return

        # Preprocess the input sector for survival rate boost
        sector_boost = 1.2 if sector_normalized in self.boosted_sectors else 1

        # Create a DataFrame for input
        input_data = pd.DataFrame({
            'FOUNDING YEAR': [founding_year],
            'UNICORN ENTRY YEAR': [entry_year],
            'ACQUISITIONS': [0],  # Assume no acquisitions for new startups; adjust based on input if needed
            'INDUSTRY_SUCCESS_BOOST': [sector_boost]
        })

        # Make prediction and get probability of success and failure
        prediction_proba = self.model.predict_proba(input_data)

        # Success and failure rates as percentages
        success_rate_percentage = prediction_proba[0][1] * 100
        failure_rate_percentage = prediction_proba[0][0] * 100

        # Print the results
        print(f"The startup '{company_name}' in the '{sector_normalized}' sector has:")
        print(f"- Success rate: {success_rate_percentage:.2f}%")
        print(f"- Failure rate: {failure_rate_percentage:.2f}%")

# Instantiate the predictor with the trained model and valid sectors
valid_sectors = data['INDUSTRY'].unique()
boosted_sectors = ['E-commerce', 'Foodtech', 'IT and Gaming', 'Fintech']
predictor = StartupSurvivalPredictor(model, valid_sectors, boosted_sectors)

# Example usage: Predict for a sample startup (input values can be changed)
company_name = "Example Startup"
sector = "Fintech"
founding_year = 2024
entry_year = 2027
valuation = "$5 Billion"  # Valuation input can be handled separately if needed

# Predict survival rate for this sample startup
predictor.predict_survival_rate(company_name, sector, founding_year, entry_year, valuation)

The startup 'Example Startup' in the 'Fintech' sector has:
- Success rate: 91.00%
- Failure rate: 9.00%
