In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer
from wordcloud import WordCloud
import re
import os
from PIL import ImageFont

In [10]:
# Load Data
# One CSV per sector; the dictionary key is recorded on every row as "Source_Sector".
file_paths = {
    "Finance and Insurance": "/Users/sunidi/Downloads/Startup Failure (Finance and Insurance) (1).csv",
    "Retail Trade": "/Users/sunidi/Downloads/Cleaned_Retail_Startup_Failure.csv",
    "Food and Services": "/Users/sunidi/Downloads/Startup Failure (Food and services) (1).csv",
    "Manufactures": "/Users/sunidi/Downloads/Startup Failure (Manufactures) (1).csv",
    "Information Sector": "/Users/sunidi/Downloads/Startup Failures (Information Sector) (1).csv",
    "Health Sector": "/Users/sunidi/Downloads/Startup Failure (Health Care) (1).csv"
}

valid_dataframes = {}
# NOTE: `df` stays bound to the last sector frame after this loop, and a later
# cell reads it, so the loop variable names are deliberately left as they are.
for key in file_paths:
    path = file_paths[key]
    # Malformed rows are reported as warnings instead of aborting the read.
    df = pd.read_csv(path, on_bad_lines='warn')
    df["Source_Sector"] = key
    valid_dataframes[key] = df

# Stack the sector frames into one table with a fresh 0..n-1 index.
combined_df = pd.concat(list(valid_dataframes.values()), ignore_index=True)

In [3]:
def clean_funding(val):
    """Convert a free-text funding figure into a plain dollar amount.

    Parenthesised notes are removed, ``$`` signs and thousands separators are
    stripped, and the first number found is scaled by the unit named in the
    first alphabetic token that follows it: ``B...`` (1e9), ``M...`` (1e6),
    ``K...`` or ``THOUSAND`` (1e3). Matching is case-insensitive.

    Args:
        val: Raw cell value, e.g. ``"$1.5B"`` or ``"$3 million (Series A)"``.

    Returns:
        float: The parsed amount, or ``np.nan`` when ``val`` is not a string
        or contains no digits.
    """
    if not isinstance(val, str):
        return np.nan

    text = re.sub(r'\([^)]*\)', '', val)
    text = text.upper().replace('$', '').replace(',', '').strip()

    match = re.search(r'\d+(\.\d+)?', text)
    if not match:
        return np.nan
    num = float(match.group())

    # Read the unit from the first word after the number only. Testing for
    # 'M' anywhere in the string let unrelated words (e.g. "... FROM MANY
    # INVESTORS") downgrade a "B" amount to millions.
    unit_match = re.search(r'[A-Z]+', text[match.end():])
    unit = unit_match.group() if unit_match else ''

    if unit.startswith('B'):
        return num * 1e9
    if unit.startswith('M'):
        return num * 1e6
    if unit.startswith('K') or unit.startswith('THOUSAND'):
        return num * 1e3
    return num

# Parse every raw "How Much They Raised" entry into a numeric dollar amount.
raised_raw = combined_df['How Much They Raised']
combined_df['Funding Amount'] = raised_raw.apply(clean_funding)

def extract_years_safe(op_string):
    """Pull the first two four-digit years out of a "years of operation" string.

    Text inside parentheses is discarded before searching, so a year that
    appears only in a parenthesised note is not picked up.

    Args:
        op_string: Raw cell value, e.g. ``"2010-2015"``.

    Returns:
        tuple: ``(start_year, end_year)`` as ints, or ``(np.nan, np.nan)``
        when ``op_string`` is not a string or holds fewer than two
        four-digit groups.
    """
    # Guard clauses replace the former bare ``except:``. Nothing below can
    # raise for a str input, and a bare except would also have swallowed
    # KeyboardInterrupt / SystemExit.
    if not isinstance(op_string, str):
        return np.nan, np.nan

    cleaned = re.sub(r'\([^)]*\)', '', op_string)
    years = re.findall(r'\d{4}', cleaned)
    if len(years) < 2:
        return np.nan, np.nan
    return int(years[0]), int(years[1])

# Expand each "Years of Operation" string into two columns: start and end year.
year_pairs = combined_df['Years of Operation'].apply(
    lambda op: pd.Series(extract_years_safe(op))
)
combined_df[['Start Year', 'End Year']] = year_pairs

# Discard every column whose name starts with "Unnamed".
unnamed_mask = combined_df.columns.str.contains('^Unnamed')
combined_df = combined_df.loc[:, ~unnamed_mask]

# Zero-fill missing values in all numeric columns except the target.
# NOTE(review): despite its name, binary_cols holds every numeric column other
# than 'Overhype', which includes 'Funding Amount', 'Start Year' and
# 'End Year', so their gaps become 0 as well. Confirm this is intended.
numeric_columns = combined_df.select_dtypes(include=[np.number]).columns
binary_cols = [name for name in numeric_columns if name != 'Overhype']
combined_df[binary_cols] = combined_df[binary_cols].fillna(0)

In [4]:
# Feature Engineering
# Keep only rows with a known target. The explicit .copy() makes model_df an
# independent frame, so adding 'Funding Bucket' below no longer triggers
# pandas' SettingWithCopyWarning.
model_df = combined_df.dropna(subset=['Overhype']).copy()

# Bin the dollar amount into five ordered ranges. Intervals are right-closed
# (pd.cut default), and the -1 lower edge lets an amount of 0 land in '<1M'.
model_df['Funding Bucket'] = pd.cut(model_df['Funding Amount'], bins=[-1, 1e6, 1e7, 1e8, 1e9, np.inf], labels=['<1M', '1M-10M', '10M-100M', '100M-1B', '>1B'])

# Numeric features: the two year columns plus every column in binary_cols,
# leaving out the raw 'Funding Amount' (it is represented by the bucket).
x_numeric_base = ['Start Year', 'End Year'] + [col for col in binary_cols if col in model_df.columns and col not in ['Funding Amount']]
# dict.fromkeys removes duplicates while preserving first-seen order.
x_numeric = list(dict.fromkeys(x_numeric_base))
x_categorical = ['Funding Bucket']
features = x_categorical + x_numeric

# Binary target: 1 when 'Overhype' exceeds 0.5, otherwise 0.
y = (model_df['Overhype'] > 0.5).astype(int)
X = model_df[features]

# One-hot encode the bucket (dropping the first level) and standardise the
# numeric columns.
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(drop='first', sparse_output=False), x_categorical),
    ('num', StandardScaler(), x_numeric)
])

# NOTE(review): the preprocessor is fitted on every row of X here; any
# train/test split made from X_processed afterwards shares these fitted
# statistics between both parts.
X_processed = preprocessor.fit_transform(X)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_df['Funding Bucket'] = pd.cut(model_df['Funding Amount'], bins=[-1, 1e6, 1e7, 1e8, 1e9, np.inf], labels=['<1M', '1M-10M', '10M-100M', '100M-1B', '>1B'])


In [12]:
from sklearn.model_selection import train_test_split

# Stratified hold-out split of the feature frame X and target y:
# 60% of the rows go to training, 40% to testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.4, random_state=42)


In [13]:
# Remove rows whose 'Overhype' value is missing.
# NOTE(review): `df` is not created in this cell; it is whatever an earlier
# cell last bound to that name (presumably the final frame read by the
# loading loop). Confirm that this is the frame meant to be checked.
required_cols = ['Overhype']
df = df.dropna(subset=required_cols)

# Report the (rows, columns) that remain.
print(df.shape)


(60, 21)


In [5]:
# Step: Handling Class Imbalance with SMOTE and Evaluating Models

import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# X_processed and y must already exist in the kernel before this cell runs.
# Stratified 70/30 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_processed, y, test_size=0.3, random_state=42, stratify=y
)

# Oversample the training part only; the test part is left as drawn.
smote = SMOTE(random_state=42)
X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)

# Candidate classifiers, registered in the order they are evaluated.
models = {}
models["Random Forest (weighted)"] = RandomForestClassifier(random_state=42, class_weight='balanced')
models["Logistic Regression (weighted)"] = LogisticRegression(max_iter=1000, class_weight='balanced')
models["Decision Tree (weighted)"] = DecisionTreeClassifier(random_state=42, class_weight='balanced')
models["Gradient Boosting"] = GradientBoostingClassifier(random_state=42)

# Fit each model on the balanced training data and score it on the test part.
results = {}
for name, model in models.items():
    print(f"\nModel: {name}")
    model.fit(X_train_bal, y_train_bal)
    preds = model.predict(X_test)
    cm = confusion_matrix(y_test, preds)
    cr = classification_report(y_test, preds)
    print("Confusion Matrix:", cm, "Classification Report:", cr, sep="\n")
    results[name] = {
        "Accuracy": accuracy_score(y_test, preds),
        "Confusion Matrix": cm,
        "Classification Report": classification_report(y_test, preds, output_dict=True),
    }


def _class1_score(report, metric):
    """Return the class-"1" value of `metric` from a report dict, rounded to 3 places (NaN if absent)."""
    return round(report.get("1", {}).get(metric, np.nan), 3)


# One row per model: accuracy plus recall/precision for class 1.
summary_df = pd.DataFrame({
    model: {
        "Accuracy": round(metrics["Accuracy"], 3),
        "Recall (Class 1)": _class1_score(metrics["Classification Report"], "recall"),
        "Precision (Class 1)": _class1_score(metrics["Classification Report"], "precision"),
    }
    for model, metrics in results.items()
}).T

print("\nModel Evaluation Summary:")
print(summary_df)




Model: Random Forest (weighted)
Confusion Matrix:
[[65  2]
 [ 4  1]]
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.97      0.96        67
           1       0.33      0.20      0.25         5

    accuracy                           0.92        72
   macro avg       0.64      0.59      0.60        72
weighted avg       0.90      0.92      0.91        72


Model: Logistic Regression (weighted)
Confusion Matrix:
[[60  7]
 [ 3  2]]
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.90      0.92        67
           1       0.22      0.40      0.29         5

    accuracy                           0.86        72
   macro avg       0.59      0.65      0.60        72
weighted avg       0.90      0.86      0.88        72


Model: Decision Tree (weighted)
Confusion Matrix:
[[60  7]
 [ 4  1]]
Classification Report:
              precision    recall  f1-score   support

      

In [21]:
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Sanity check: the feature matrix and the target must have the same length.
print("Initial shape check:")
print(f"X_processed: {X_processed.shape}, y: {len(y)}")

# Stratified 70/30 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_processed, y, stratify=y, test_size=0.3, random_state=42
)
print(f"Train size: {X_train.shape[0]}, Test size: {X_test.shape[0]}")

# Balance the classes in the training part only.
smote = SMOTE(random_state=42)
X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)
print(f"After SMOTE - X_train_bal: {X_train_bal.shape}, y_train_bal: {len(y_train_bal)}")

# Classifiers to compare.
models = {
    "Random Forest (weighted)": RandomForestClassifier(class_weight='balanced', random_state=42),
    "Logistic Regression (weighted)": LogisticRegression(class_weight='balanced', max_iter=1000),
    "Decision Tree (weighted)": DecisionTreeClassifier(class_weight='balanced', random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
}

# Fit on the balanced training data, predict the untouched test part, and
# keep both the printed report and its dictionary form.
results = {}
for name, model in models.items():
    print(f"\n🔍 Model: {name}")
    preds = model.fit(X_train_bal, y_train_bal).predict(X_test)

    cm = confusion_matrix(y_test, preds)
    cr = classification_report(y_test, preds)

    print("Confusion Matrix:")
    print(cm)
    print("Classification Report:")
    print(cr)

    report_dict = classification_report(y_test, preds, output_dict=True)
    results[name] = {
        "Accuracy": accuracy_score(y_test, preds),
        "Confusion Matrix": cm,
        "Classification Report": report_dict,
    }

# Summary table: one row per model, metrics rounded to three decimals.
# Class-1 figures fall back to NaN when the report has no "1" entry.
summary_rows = {}
for model_name, metrics in results.items():
    class1 = metrics["Classification Report"].get("1", {})
    summary_rows[model_name] = {
        "Accuracy": round(metrics["Accuracy"], 3),
        "Recall (Class 1)": round(class1.get("recall", np.nan), 3),
        "Precision (Class 1)": round(class1.get("precision", np.nan), 3),
    }
summary_df = pd.DataFrame(summary_rows).T

print("\n📊 Model Evaluation Summary:")
print(summary_df)


Initial shape check:
X_processed: (238, 17), y: 238
Train size: 166, Test size: 72
After SMOTE - X_train_bal: (310, 17), y_train_bal: 310

🔍 Model: Random Forest (weighted)
Confusion Matrix:
[[65  2]
 [ 4  1]]
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.97      0.96        67
           1       0.33      0.20      0.25         5

    accuracy                           0.92        72
   macro avg       0.64      0.59      0.60        72
weighted avg       0.90      0.92      0.91        72


🔍 Model: Logistic Regression (weighted)
Confusion Matrix:
[[60  7]
 [ 3  2]]
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.90      0.92        67
           1       0.22      0.40      0.29         5

    accuracy                           0.86        72
   macro avg       0.59      0.65      0.60        72
weighted avg       0.90      0.86      0.88        72


🔍 Model: De



Confusion Matrix:
[[66  1]
 [ 4  1]]
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.99      0.96        67
           1       0.50      0.20      0.29         5

    accuracy                           0.93        72
   macro avg       0.72      0.59      0.62        72
weighted avg       0.91      0.93      0.92        72


📊 Model Evaluation Summary:
                                Accuracy  Recall (Class 1)  \
Random Forest (weighted)           0.917               0.2   
Logistic Regression (weighted)     0.861               0.4   
Decision Tree (weighted)           0.847               0.2   
Gradient Boosting                  0.931               0.2   

                                Precision (Class 1)  
Random Forest (weighted)                      0.333  
Logistic Regression (weighted)                0.222  
Decision Tree (weighted)                      0.125  
Gradient Boosting                             0.500  


zsh:1: command not found: pip
