In [1]:
"""
Startup Evaluation Engine
------------------------
This script simulates a credit-score-like evaluation for startups using a composite scoring methodology.

Sections:
1. Data Loading & Exploration
2. Data Preprocessing & Normalization
3. Scoring Formula & Feature Weighting
4. Ranking & Interpretation
5. Visualization
6. Documentation & Insights
7. Bonus: ML Extension (KMeans Clustering)
"""

'\nStartup Evaluation Engine\n------------------------\nThis script simulates a credit-score-like evaluation for startups using a composite scoring methodology.\n\nSections:\n1. Data Loading & Exploration\n2. Data Preprocessing & Normalization\n3. Scoring Formula & Feature Weighting\n4. Ranking & Interpretation\n5. Visualization\n6. Documentation & Insights\n7. Bonus: ML Extension (KMeans Clustering)\n'

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [3]:
# 1. Data Loading & Exploration
# -----------------------------
# Load the dataset into a pandas DataFrame for analysis.
DATA_PATH = '../Startup_Scoring_Dataset.csv' if os.path.exists('../Startup_Scoring_Dataset.csv') else 'Startup_Scoring_Dataset.csv'
df = pd.read_csv(DATA_PATH)
print('First 5 rows:')
print(df.head())

First 5 rows:
  startup_id  team_experience  market_size_million_usd  monthly_active_users  \
0       S001                7                      789                 66299   
1       S002                4                      939                 34866   
2       S003                8                       42                 73630   
3       S004                5                       57                 93657   
4       S005                7                      512                 61187   

   monthly_burn_rate_inr  funds_raised_inr  valuation_inr  
0                6091495           4183839      263247468  
1                9106842            666326      172927107  
2                8817088            649150      313090455  
3                6374167          20153173      305887183  
4                 841971            118834       84815222  


In [4]:
# 2. Data Preprocessing & Normalization
# -------------------------------------
# Normalize all numeric columns to a 0-1 range using Min-Max normalization.
# For negatively correlated metrics (like burn rate), invert the normalization so lower is better.
good_cols = ['team_experience', 'market_size_million_usd', 'monthly_active_users', 'funds_raised_inr', 'valuation_inr']  # Higher is better
bad_cols = ['monthly_burn_rate_inr']  # Higher is worse

In [5]:
# Normalize 'good' columns (higher is better)
for col in good_cols:
    df[col + '_norm'] = (df[col] - df[col].min()) / (df[col].max() - df[col].min())
# Normalize and invert 'bad' columns (higher is worse)
for col in bad_cols:
    df[col + '_norm'] = 1 - (df[col] - df[col].min()) / (df[col].max() - df[col].min())

In [6]:
print('\nNormalized columns:')
print(df[[c+'_norm' for c in good_cols + bad_cols]].head())


Normalized columns:
   team_experience_norm  market_size_million_usd_norm  \
0              0.666667                      0.789206   
1              0.333333                      0.941955   
2              0.777778                      0.028513   
3              0.444444                      0.043788   
4              0.666667                      0.507128   

   monthly_active_users_norm  funds_raised_inr_norm  valuation_inr_norm  \
0                   0.669162               0.082061            0.518992   
1                   0.347274               0.011052            0.333843   
2                   0.744235               0.010706            0.621166   
3                   0.949320               0.404435            0.606400   
4                   0.616813               0.000000            0.153220   

   monthly_burn_rate_inr_norm  
0                    0.393230  
1                    0.084439  
2                    0.114112  
3                    0.364283  
4                    0.93

In [7]:
# 3. Scoring Formula & Feature Weighting
# --------------------------------------
# Assign weights to each feature based on business logic and perceived impact.
# The sum of weights should be 1.0 (or 100% when scaled).
weights = {
    'team_experience_norm': 0.15,           # Team quality
    'market_size_million_usd_norm': 0.20,   # Market opportunity
    'monthly_active_users_norm': 0.25,      # Traction
    'monthly_burn_rate_inr_norm': 0.10,     # Efficiency (inverted)
    'funds_raised_inr_norm': 0.10,          # Fundraising
    'valuation_inr_norm': 0.20              # Perceived value
}

In [8]:
# Compute the composite score as a weighted sum, scaled to 100.
df['composite_score'] = sum(df[col] * w for col, w in weights.items()) * 100
print('\nComposite scores:')
print(df[['startup_id', 'composite_score']].head())


Composite scores:
  startup_id  composite_score
0       S001        57.645910
1       S002        40.152721
2       S003        44.514294
3       S004        51.090609
4       S005        47.935444


In [9]:
# 4. Ranking & Interpretation
# ---------------------------
# Rank startups by their composite score. Identify top and bottom performers.
df_sorted = df.sort_values('composite_score', ascending=False).reset_index(drop=True)
top10 = df_sorted.head(10)
bottom10 = df_sorted.tail(10)

In [10]:
print('\nTop 10 Startups:')
print(top10[['startup_id', 'composite_score']])
print('\nBottom 10 Startups:')
print(bottom10[['startup_id', 'composite_score']])


Top 10 Startups:
  startup_id  composite_score
0       S006        79.157433
1       S045        76.437558
2       S077        76.391682
3       S033        73.918777
4       S029        73.364384
5       S097        73.091131
6       S013        72.604364
7       S012        69.856259
8       S008        68.773096
9       S043        68.474004

Bottom 10 Startups:
   startup_id  composite_score
90       S051        33.119537
91       S017        32.799997
92       S093        32.231949
93       S079        31.667025
94       S074        30.770841
95       S058        30.250202
96       S098        26.344424
97       S084        25.842748
98       S023        24.231122
99       S055        19.174274


In [11]:
# Example: Print details for the top and bottom scorer for interpretation.
top_row = top10.iloc[0]
bottom_row = bottom10.iloc[0]
print(f"\nTop Scorer: {top_row['startup_id']}\n", top_row)
print(f"\nBottom Scorer: {bottom_row['startup_id']}\n", bottom_row)


Top Scorer: S006
 startup_id                           S006
team_experience                        10
market_size_million_usd               416
monthly_active_users                95939
monthly_burn_rate_inr              551157
funds_raised_inr                 48939582
valuation_inr                   307433345
team_experience_norm                  1.0
market_size_million_usd_norm     0.409369
monthly_active_users_norm        0.972689
funds_raised_inr_norm            0.985549
valuation_inr_norm               0.609569
monthly_burn_rate_inr_norm       0.960596
composite_score                 79.157433
Name: 0, dtype: object

Bottom Scorer: S051
 startup_id                           S051
team_experience                         3
market_size_million_usd               180
monthly_active_users                51763
monthly_burn_rate_inr             3501356
funds_raised_inr                  6095929
valuation_inr                   146815208
team_experience_norm             0.222222
market_size_

In [12]:
# 5. Visualization
# ----------------
# Create output directory if it doesn't exist.
os.makedirs('../outputs', exist_ok=True) if os.path.exists('../outputs') else os.makedirs('outputs', exist_ok=True)
OUTPUT_DIR = '../outputs' if os.path.exists('../outputs') else 'outputs'

In [13]:
# Bar chart: Composite scores for all startups, sorted.
plt.figure(figsize=(14,4))
plt.bar(df_sorted['startup_id'], df_sorted['composite_score'])
plt.title('Startup Composite Scores (Sorted)')
plt.xlabel('Startup ID')
plt.ylabel('Score')
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, 'bar_chart_scores.png'))
plt.close()

In [14]:
# Correlation heatmap: Shows relationships between normalized features.
plt.figure(figsize=(8,6))
sns.heatmap(df[[c+'_norm' for c in good_cols + bad_cols]].corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap (Normalized Features)')
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, 'correlation_heatmap.png'))
plt.close()

In [15]:
# Score distribution: Histogram of composite scores.
plt.figure(figsize=(8,4))
sns.histplot(df['composite_score'], bins=20, kde=True)
plt.title('Score Distribution')
plt.xlabel('Composite Score')
plt.ylabel('Frequency')
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, 'score_distribution.png'))
plt.close()

In [16]:
# 6. Documentation & Insights
# ---------------------------
# Append key insights and methodology to the README for transparency.
with open('README.md', 'a') as f:
    f.write('\n\n## Insights from Analysis (auto-appended)\n')
    f.write('- Weights chosen based on perceived business impact (traction, market, team, etc.).\n')
    f.write('- Burn Rate is inverted so lower burn is better.\n')
    f.write('- See script output and plots in outputs/ for more details.\n')

In [17]:
print(f"\nPlots saved to {OUTPUT_DIR}/.\nDocumentation appended to README.md.")


Plots saved to outputs/.
Documentation appended to README.md.


In [18]:
# 7. Bonus: ML Extension - Clustering Startups with KMeans
# --------------------------------------------------------
# Cluster startups into archetypes using KMeans on normalized features.
# Visualize clusters using PCA for dimensionality reduction.
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

In [19]:
# Prepare data for clustering (normalized features only)
feature_cols = [c+'_norm' for c in good_cols + bad_cols]
X = df[feature_cols]

In [20]:
# Choose number of clusters (e.g., 3 archetypes)
n_clusters = 3
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
df['cluster'] = kmeans.fit_predict(X)

In [21]:
# Reduce to 2D for visualization using PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
df['pca1'] = X_pca[:,0]
df['pca2'] = X_pca[:,1]

In [22]:
# Plot clusters in PCA-reduced space
plt.figure(figsize=(8,6))
for i in range(n_clusters):
    plt.scatter(df[df['cluster']==i]['pca1'], df[df['cluster']==i]['pca2'], label=f'Cluster {i}')
plt.title('Startup Clusters (KMeans, PCA-reduced)')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.legend()
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, 'kmeans_clusters.png'))
plt.close()

In [23]:
# Show mean feature values for each cluster (archetype profile)
grouped = df.groupby('cluster')[feature_cols + ['composite_score']].mean()
print('\nKMeans Cluster Centers (mean normalized features):')
print(grouped)


KMeans Cluster Centers (mean normalized features):
         team_experience_norm  market_size_million_usd_norm  \
cluster                                                       
0                    0.810700                      0.555970   
1                    0.669591                      0.431022   
2                    0.206349                      0.658278   

         monthly_active_users_norm  funds_raised_inr_norm  valuation_inr_norm  \
cluster                                                                         
0                         0.692417               0.787386            0.487732   
1                         0.428742               0.148298            0.527580   
2                         0.493962               0.526393            0.348438   

         monthly_burn_rate_inr_norm  composite_score  
cluster                                               
0                          0.535097        63.569803  
1                          0.480644        46.223855  
2     

In [24]:
# Append clustering explanation and summary to README
with open('README.md', 'a') as f:
    f.write('\n\n## Bonus: ML Extension - Startup Clustering\n')
    f.write('We applied KMeans clustering (k=3) to the normalized features to identify archetypes among startups.\n')
    f.write('Clusters were visualized using PCA for dimensionality reduction.\n')
    f.write('Cluster centers (mean feature values) reveal typical profiles, e.g., high-growth/low-burn, high-burn/low-traction, etc.\n')
    f.write('See the plot in outputs/kmeans_clusters.png and the summary table below.\n')
    f.write('\n\nCluster Centers (mean normalized features):\n')
    f.write(grouped.to_string())
    f.write('\n')

In [25]:
print(f"\nKMeans clustering complete. Cluster plot saved to {OUTPUT_DIR}/kmeans_clusters.png. Explanation appended to README.md.")


KMeans clustering complete. Cluster plot saved to outputs/kmeans_clusters.png. Explanation appended to README.md.
