In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
import joblib

# Load dataset
df = pd.read_csv('cleaned_final_dataset.csv')

# Features and target
X = df.drop(columns=['success_ratio'])
y = df['success_ratio']

# Define categorical and numerical columns
categorical_features = ['category_code']
numerical_features = [col for col in X.columns if col != 'category_code']

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', StandardScaler(), numerical_features)
    ])

# RandomForest with balanced class weights to handle imbalance
rf_classifier = RandomForestClassifier(random_state=42, class_weight='balanced')

print(X.head())
# Full pipeline
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', rf_classifier)
])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Train model
model.fit(X_train, y_train)

# Predict probabilities and classes
y_pred_proba = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

# Evaluate model with multiple metrics
auc = roc_auc_score(y_test, y_pred_proba)
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"✅ ROC-AUC Score   : {auc:.4f}")
print(f"✅ Accuracy       : {acc:.4f}")
print(f"✅ Precision      : {prec:.4f}")
print(f"✅ Recall         : {rec:.4f}")
print(f"✅ F1 Score       : {f1:.4f}")

# Save the pipeline
joblib.dump(model, 'rf_success_model_balanced.pkl')
print("✅ Pipeline and model saved to 'rf_success_model_balanced.pkl'")


  category_code  funding_required  founded_year  problem_importance  \
0         music            375000          2007                   8   
1    enterprise          40100000          2000                   9   
2           web           2600000          2009                   7   
3      software          40000000          2002                   9   
4   games_video           1300000          2010                   9   

   solution_uniqueness  usp_strength  business_model_clarity  \
0                    5             8                       8   
1                    5             9                       7   
2                   10             9                       7   
3                   10             9                       6   
4                    7             7                       5   

   target_market_size  team_experience_years  equity_offered  
0                  27                     12            6.40  
1                   2                      5           16.09  

In [7]:
# Robust prediction function with adjustable threshold
def predict_startup_success_custom(sample_dict, threshold=0.4):
    sample_df = pd.DataFrame([sample_dict])
    
    # Add any missing columns with 0 (or np.nan if you want)
    missing_cols = set(X.columns) - set(sample_df.columns)
    for col in missing_cols:
        sample_df[col] = 0
    
    # Reorder columns to match training data
    sample_df = sample_df[X.columns]
    
    prob_success = model.predict_proba(sample_df)[0][1]
    pred_class = int(prob_success >= threshold)
    
    return pred_class, prob_success

# Sample startup test with threshold 0.4
sample_startup = {
    'category_code': 'health',
    'founded_year': 2020,
    'problem_importance': 8,
    'solution_uniqueness': 9,
    'usp_strength': 7,
    'business_model_clarity': 8,
    'team_experience_years': 5,
    'equity_offered': 20,
    'funding_required_log': np.log1p(1_000_000),
    'target_market_size_log': np.log1p(50_000_000),
    'funding_per_year': 1_000_000 / (2025 - 2020),
    'problem_solution_score': 8 * 9
}

pred_class, prob = predict_startup_success_custom(sample_startup, threshold=0.4)

print("\n🧪 Sample Startup Prediction:")
print(f"➡️ Success probability: {prob*100:.2f}%")
print(f"➡️ Predicted class     : {pred_class}  (0 = Fail, 1 = Success)")



🧪 Sample Startup Prediction:
➡️ Success probability: 50.00%
➡️ Predicted class     : 1  (0 = Fail, 1 = Success)


In [50]:
print("\n🧪 Batch Prediction on 10 Sample Startups (Threshold = 0.4):")
sample_threshold = 0.4

# Take 10 samples from the test set
sample_10_df = X_test.sample(3, random_state=42)

for idx, row in sample_10_df.iterrows():
    sample_dict = row.to_dict()
    pred_class, prob = predict_startup_success_custom(sample_dict, threshold=sample_threshold)
    print(f"\n🔹 Sample {idx}")
    print(f"➡️ Success probability: {prob*100:.2f}%")
    print(f"➡️ Predicted class     : {pred_class}  (0 = Fail, 1 = Success)")


🧪 Batch Prediction on 10 Sample Startups (Threshold = 0.4):

🔹 Sample 158
➡️ Success probability: 71.00%
➡️ Predicted class     : 1  (0 = Fail, 1 = Success)

🔹 Sample 439
➡️ Success probability: 71.00%
➡️ Predicted class     : 1  (0 = Fail, 1 = Success)

🔹 Sample 912
➡️ Success probability: 85.00%
➡️ Predicted class     : 1  (0 = Fail, 1 = Success)
