# Predição de Sucesso de Startups

Este notebook tem como objetivo explorar e modelar dados reais de startups para prever se uma empresa terá sucesso (ativa/adquirida) ou insucesso (fechada).

Inclui: análise exploratória, tratamento de dados, seleção de variáveis, modelagem e avaliação de resultados.

## 1. Importação de Bibliotecas

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix


## 2. Carregamento e pré-visualização dos dados

In [None]:
# Load the train/test splits; the paths are resolved against the current
# working directory.
train_df = pd.read_csv("../data/train.csv")
test_df = pd.read_csv("../data/test.csv")
print("Dados de treino carregados com sucesso. Shape: {}".format(train_df.shape))
print("Dados de teste carregados com sucesso. Shape: {}".format(test_df.shape))

print(train_df.head(5))
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
train_df.info()
print(train_df.describe())

# Count missing values once and show only the columns that actually have any.
missing_counts = train_df.isnull().sum()
print(missing_counts[missing_counts > 0])

# Target distribution: absolute counts first, then percentages.
print(train_df["labels"].value_counts())
print(train_df["labels"].value_counts(normalize=True) * 100)


## 3. Criação de novas features

In [None]:
def create_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add engineered funding, milestone, relationship and location features.

    The new columns are written onto ``df`` itself (the input frame is
    mutated) and the same object is returned, so both
    ``df = create_features(df)`` and a bare ``create_features(df)`` work.

    Columns added:
        mean_funding_age: mean of the first and last funding ages.
        milestone_duration: last minus first milestone age; 0 when either
            is missing.
        milestones_per_round: milestones divided by funding rounds; 0 when
            there are no funding rounds or the ratio is missing.
        total_round_flags: row-wise sum of the ``has_*`` round flags.
        relationships_per_round: relationships divided by funding rounds;
            0 when there are no funding rounds or the ratio is missing.
        total_location_flags: row-wise sum of the ``is_*`` state flags.

    Args:
        df: Frame containing the raw columns referenced below.

    Returns:
        The same ``df`` object with the engineered columns added.

    Raises:
        KeyError: If any of the required source columns is missing.
    """
    # Zero funding rounds are turned into NaN so the per-round ratios become
    # NaN (then 0 after fillna) instead of inf.
    rounds = df['funding_rounds'].replace(0, np.nan)

    df['mean_funding_age'] = (df['age_first_funding_year'] + df['age_last_funding_year']) / 2

    # The filled result is assigned back explicitly. Calling
    # `df[col].fillna(0, inplace=True)` operates on a temporary Series: it
    # raises a FutureWarning on pandas 2.x and has no effect on `df` once
    # Copy-on-Write is active.
    df['milestone_duration'] = (
        df['age_last_milestone_year'] - df['age_first_milestone_year']
    ).fillna(0)
    df['milestones_per_round'] = (df['milestones'] / rounds).fillna(0)

    rounds_flags = ['has_VC', 'has_angel', 'has_roundA', 'has_roundB', 'has_roundC', 'has_roundD']
    df['total_round_flags'] = df[rounds_flags].sum(axis=1)

    df['relationships_per_round'] = (df['relationships'] / rounds).fillna(0)

    location_flags = ['is_CA', 'is_NY', 'is_MA', 'is_TX', 'is_otherstate']
    df['total_location_flags'] = df[location_flags].sum(axis=1)
    return df

# Run the same feature engineering on both splits (train first, then test) so
# they end up with the same engineered columns.
train_df, test_df = (create_features(frame) for frame in (train_df, test_df))


## 4. Tratamento de valores ausentes

In [None]:
# Numeric columns (raw and engineered) whose missing values are replaced by
# the median.
cols_to_impute_median = [
    'age_first_funding_year', 'age_last_funding_year',
    'age_first_milestone_year', 'age_last_milestone_year',
    'funding_total_usd',
    'mean_funding_age', 'milestone_duration',
    'milestones_per_round', 'total_round_flags',
    'relationships_per_round', 'total_location_flags'
]

for col in cols_to_impute_median:
    # The median is always taken from the training data and reused for the
    # test data, so no statistic is ever computed from the test set.
    median_val = train_df[col].median()
    # Assign the filled column back. `frame[col].fillna(..., inplace=True)`
    # acts on a temporary Series: it warns on pandas 2.x and leaves the frame
    # untouched once Copy-on-Write is active.
    train_df[col] = train_df[col].fillna(median_val)
    test_df[col] = test_df[col].fillna(median_val)


## 5. Codificação de variáveis categóricas e alinhamento entre treino e teste

In [None]:
categorical_col = ["category_code"]

# Give train and test one shared, ordered category list before encoding.
# Calling get_dummies(..., drop_first=True) on each split independently drops
# whichever category happens to sort first *in that split*; if the splits do
# not contain the same categories, a different baseline is dropped in each and
# rows get silently mis-encoded. With a shared CategoricalDtype both calls emit
# the same dummy columns and drop the same baseline.
# NOTE(review): assumes the category values are mutually sortable (e.g. all
# strings) - confirm against the data.
for col in categorical_col:
    shared_categories = sorted(set(train_df[col].dropna()) | set(test_df[col].dropna()))
    shared_dtype = pd.CategoricalDtype(categories=shared_categories)
    train_df[col] = train_df[col].astype(shared_dtype)
    test_df[col] = test_df[col].astype(shared_dtype)

train_df = pd.get_dummies(train_df, columns=categorical_col, drop_first=True)
test_df = pd.get_dummies(test_df, columns=categorical_col, drop_first=True)

# Safety net for any other column present in only one split. The target
# ('labels') is never copied across. Iterating in sorted order keeps the
# resulting column order reproducible (plain set iteration order is not).
missing_cols_in_test = set(train_df.columns) - set(test_df.columns)
missing_cols_in_train = set(test_df.columns) - set(train_df.columns)
for col in sorted(missing_cols_in_test - {'labels'}):
    test_df[col] = 0
for col in sorted(missing_cols_in_train - {'labels'}):
    train_df[col] = 0


## 6. Separação de features e target e padronização

In [None]:
# 'labels' is the target and 'id' is the row identifier written to the
# submission file; neither may be used as a predictor. Feeding the identifier
# to the model lets it fit on an arbitrary row number.
non_feature_cols = ["labels", "id"]
X_train_full = train_df.drop(columns=non_feature_cols, errors="ignore")
y_train_full = train_df["labels"]
# Select exactly the training feature columns, in the same order, so every
# later transform sees train and test in an identical layout.
X_test = test_df[X_train_full.columns].copy()

# Columns to standardize: int64/float64 columns, excluding the binary state
# flags ('is_*') and the one-hot category dummies ('category_code_*').
numerical_cols = [
    col for col in X_train_full.columns
    if X_train_full[col].dtype in ["int64", "float64"]
    and not col.startswith("is_")
    and not col.startswith("category_code_")
]

# Stratified 80/20 hold-out split; the fixed seed keeps it reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full,
    test_size=0.2,
    stratify=y_train_full,
    random_state=42)

# The scaler is fitted on the training split only and then applied unchanged
# to the validation and test data, so their statistics never leak into it.
scaler = StandardScaler()
X_train_scaled = X_train.copy()
X_val_scaled = X_val.copy()
X_test_scaled = X_test.copy()

X_train_scaled[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_val_scaled[numerical_cols] = scaler.transform(X_val[numerical_cols])
X_test_scaled[numerical_cols] = scaler.transform(X_test[numerical_cols])


## 7. Seleção de features e modelo Random Forest com RandomizedSearchCV

In [None]:
# --- Feature selection ------------------------------------------------------
# A class-balanced logistic regression ranks the features; SelectFromModel
# with threshold='mean' keeps those whose importance reaches the mean.
selection_estimator = LogisticRegression(
    class_weight='balanced',
    solver='liblinear',
    max_iter=1000,
    random_state=42,
)
selector = SelectFromModel(selection_estimator, threshold='mean')

# Fit on the training split only, then apply the same column mask everywhere.
X_train_sel = selector.fit(X_train_scaled, y_train).transform(X_train_scaled)
X_val_sel = selector.transform(X_val_scaled)
# Put the test columns in the training order before applying the mask.
X_test_aligned = X_test_scaled[X_train_scaled.columns]
X_test_sel = selector.transform(X_test_aligned)

# --- Random Forest hyper-parameter search -----------------------------------
param_dist = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [5, 10, 20, 30, None],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 4, 8],
    'bootstrap': [True, False]
}

rf = RandomForestClassifier(class_weight='balanced', random_state=42)

# 50 sampled configurations, each scored by 5-fold cross-validated accuracy.
random_search_rf = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=50,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    random_state=42,
    verbose=1,
)
random_search_rf.fit(X_train_sel, y_train)

print(f'Melhor estimador Random Forest: {random_search_rf.best_params_}')


## 8. Avaliação do modelo com métricas completas e matriz de confusão

In [None]:
# confusion_matrix is already imported with the other metrics at the top of
# the notebook, so it is not imported again here.
y_val_pred = random_search_rf.predict(X_val_sel)
print("Relatório de classificação no conjunto de validação:")
print(classification_report(y_val, y_val_pred))
print(f"Acurácia no conjunto de validação: {accuracy_score(y_val, y_val_pred):.4f}")

# Fix the row/column order to the classes the model was trained on and show
# those labels on the axes, so each cell is unambiguously identified instead
# of being labelled by its position (0, 1, ...).
class_labels = random_search_rf.classes_
cm = confusion_matrix(y_val, y_val_pred, labels=class_labels)
plt.figure(figsize=(6,4))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.xlabel('Predito')
plt.ylabel('Verdadeiro')
plt.title('Matriz de Confusão')
plt.show()


## 9. Validação cruzada para avaliação robusta

In [None]:
# Score the tuned forest with 5-fold cross-validation on the selected
# training features and report the mean accuracy and its spread.
cv_scores = cross_val_score(
    estimator=random_search_rf.best_estimator_,
    X=X_train_sel,
    y=y_train,
    scoring='accuracy',
    cv=5,
)
print(f'Validação cruzada (5-fold) - Acurácia média: {cv_scores.mean():.4f} +- {cv_scores.std():.4f}')


## 10. Treinamento final e submissão

In [None]:
# Refit the tuned model on ALL labelled data before predicting the test set.
#
# The full training frame must go through the same scaler as the test
# features. X_test_sel was built from scaled data, so fitting the forest on
# the raw, unscaled X_train_full would learn split thresholds in a different
# feature space and make the test predictions meaningless. The scaler already
# fitted on the training split is reused (not refitted) so both sides share
# exactly the same transformation.
X_train_full_scaled = X_train_full.copy()
X_train_full_scaled[numerical_cols] = scaler.transform(X_train_full[numerical_cols])
X_train_full_sel = selector.transform(X_train_full_scaled)

# Note: this refits the estimator object held by the search in place.
best_model = random_search_rf.best_estimator_
best_model.fit(X_train_full_sel, y_train_full)

submission_preds = best_model.predict(X_test_sel)
# Pair each prediction with the identifier of the corresponding test row.
submission_df = pd.DataFrame({'id': test_df['id'], 'labels': submission_preds})
submission_df.to_csv('../data/submission.csv', index=False)
print("Arquivo de submissão salvo em '../data/submission.csv'")
