In [None]:

import sys
import os
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.pipeline import Pipeline

%matplotlib inline

# Path configuration: the project root is the first of (cwd, cwd's parent)
# that contains a 'src' folder; when neither does, fall back to cwd.
current_dir = Path.cwd()
project_root = next(
    (
        candidate
        for candidate in (current_dir, current_dir.parent)
        if (candidate / 'src').exists()
    ),
    current_dir,
)

# Standard sub-folders, all derived from the project root.
src_path, data_path, results_path = (
    project_root / folder for folder in ('src', 'data', 'results')
)

# Create the data and results folders when they are missing.
data_path.mkdir(exist_ok=True)
results_path.mkdir(exist_ok=True)

# Put src/ at the front of the import path (only once).
if sys.path.count(str(src_path)) == 0:
    sys.path.insert(0, str(src_path))

print(
    "Ambiente configurado:",
    "    Project Root: {}".format(project_root),
    "   Data Path: {}".format(data_path),
    sep="\n",
)

In [None]:
print(" Carregando m√≥dulos...")

# Simplified version of the preprocessor
class SimplePreprocessor:
    """Pick a single feature column and split a DataFrame into train/test sets."""

    def __init__(self):
        # Scalers are created here but are not applied by prepare_data.
        self.scaler_x = StandardScaler()
        self.scaler_y = StandardScaler()

    def prepare_data(self, df, target_column, test_size=0.2, random_state=42):
        """Prepare data for training.

        Args:
            df: DataFrame holding the feature column(s) and the target column.
            target_column: Name of the column to predict.
            test_size: Share of rows held out for testing, forwarded to
                train_test_split (default 0.2).
            random_state: Seed forwarded to train_test_split so the split is
                reproducible (default 42).

        Returns:
            Tuple ``(X_train, X_test, y_train, y_test)``; the X parts are
            single-column DataFrames, the y parts are Series.

        Raises:
            KeyError: If ``target_column`` is not a column of ``df``.
            ValueError: If ``df`` has no column other than the target.
        """
        if target_column not in df.columns:
            raise KeyError(
                f"Coluna target '{target_column}' ausente do DataFrame"
            )

        X = df.drop(columns=[target_column])
        y = df[target_column]

        # Without at least one feature every later step would fail with an
        # unrelated error, so report the real problem here.
        if X.shape[1] == 0:
            raise ValueError(
                "O DataFrame precisa ter ao menos uma coluna de feature"
            )

        # With multiple features, only the first one is used
        if X.shape[1] > 1:
            print(f"  Usando apenas a primeira feature: {X.columns[0]}")
            X = X.iloc[:, 0:1]

        # Split the data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )

        return X_train, X_test, y_train, y_test

# Regression model manager
class RegressionManager:
    """Train regression models and keep their predictions and metrics by name."""

    # Column order of the table built by compare_models.
    _COMPARISON_COLUMNS = ['Modelo', 'R¬≤_Treino', 'R¬≤_Teste', 'MSE_Teste']

    def __init__(self):
        # Fitted estimators keyed by model name ('Linear', 'Poly_<degree>').
        self.models = {}
        # Per model: {'predictions': {...}, 'metrics': {...}}, each with
        # 'train' and 'test' entries.
        self.results = {}

    def train_linear(self, X_train, X_test, y_train, y_test):
        """Fit a plain LinearRegression and record it under 'Linear'.

        Returns:
            The fitted estimator.
        """
        return self._fit_and_record(
            'Linear', LinearRegression(), X_train, X_test, y_train, y_test
        )

    def train_polynomial(self, X_train, X_test, y_train, y_test, degree=2):
        """Fit PolynomialFeatures(degree) + LinearRegression as 'Poly_<degree>'.

        Args:
            degree: Polynomial degree passed to PolynomialFeatures.

        Returns:
            The fitted Pipeline.
        """
        model = Pipeline([
            ('poly', PolynomialFeatures(degree=degree)),
            ('linear', LinearRegression())
        ])
        return self._fit_and_record(
            f'Poly_{degree}', model, X_train, X_test, y_train, y_test
        )

    def _fit_and_record(self, model_name, model, X_train, X_test, y_train, y_test):
        """Fit ``model``, score it on both splits and store it as ``model_name``."""
        model.fit(X_train, y_train)

        predictions = {
            'train': model.predict(X_train),
            'test': model.predict(X_test),
        }
        metrics = {
            'train': self._calculate_metrics(y_train, predictions['train']),
            'test': self._calculate_metrics(y_test, predictions['test']),
        }

        self.models[model_name] = model
        self.results[model_name] = {
            'predictions': predictions,
            'metrics': metrics
        }

        print(f" {model_name} - R¬≤: {metrics['test']['R¬≤']:.4f}")
        return model

    def _calculate_metrics(self, y_true, y_pred):
        """Return a dict with MSE, RMSE, MAE and R² for the given predictions."""
        # Compute the MSE once and derive the RMSE from it.
        mse = mean_squared_error(y_true, y_pred)
        return {
            'MSE': mse,
            'RMSE': np.sqrt(mse),
            'MAE': mean_absolute_error(y_true, y_pred),
            'R¬≤': r2_score(y_true, y_pred)
        }

    def compare_models(self):
        """Build a DataFrame with one row per trained model.

        The four comparison columns are always present, even when no model
        has been trained yet, so column lookups on the result never fail.
        """
        comparison = [
            {
                'Modelo': name,
                'R¬≤_Treino': result['metrics']['train']['R¬≤'],
                'R¬≤_Teste': result['metrics']['test']['R¬≤'],
                'MSE_Teste': result['metrics']['test']['MSE']
            }
            for name, result in self.results.items()
        ]
        return pd.DataFrame(comparison, columns=self._COMPARISON_COLUMNS)

print("‚úÖ M√≥dulos carregados!")

In [None]:
print(" CARREGAMENTO AUTOM√ÅTICO DE DADOS")

# Find CSV files. Path.glob yields entries in arbitrary, filesystem-dependent
# order, so sort them to make the "first file" choice reproducible.
csv_files = sorted(data_path.glob("*.csv"))
if not csv_files:
    print(" Nenhum arquivo CSV encontrado na pasta data/!")
    print(" Coloque seu arquivo CSV na pasta data/ e reinicie o notebook")
    # Stop the run here: the following cells need df, TARGET_COLUMN and
    # DATA_FILE, and would otherwise fail with a NameError or silently reuse
    # values left over from an earlier execution.
    raise FileNotFoundError(f"Nenhum arquivo CSV encontrado em {data_path}")

print(" Arquivos encontrados:")
for i, f in enumerate(csv_files, start=1):
    print(f"   {i}. {f.name}")

# Use the first CSV file (alphabetical order)
DATA_FILE = csv_files[0].name
print(f"\n Usando arquivo: {DATA_FILE}")

# Load the data
df = pd.read_csv(data_path / DATA_FILE)
print(f" Dados carregados: {df.shape[0]} linhas, {df.shape[1]} colunas")

# Detect the target column: first well-known name present in the file,
# otherwise the last column.
possible_targets = ['target', 'y', 'result', 'output', 'value']
TARGET_COLUMN = next(
    (col for col in possible_targets if col in df.columns),
    df.columns[-1],
)

print(f" Vari√°vel target: {TARGET_COLUMN}")

# Show a preview and summary statistics
print("\n Primeiras linhas:")
display(df.head())

print("\n Estat√≠sticas:")
display(df.describe())

In [None]:
print("üîß PR√â-PROCESSAMENTO")

# Build the preprocessor and split the loaded frame into train/test parts.
preprocessor = SimplePreprocessor()
X_train, X_test, y_train, y_test = preprocessor.prepare_data(
    df,
    TARGET_COLUMN,
)

# Report the size of each split and the feature that was kept.
print(" Dados de treino: {}".format(X_train.shape))
print(" Dados de teste: {}".format(X_test.shape))
print(" Feature usada: {}".format(X_train.columns[0]))

In [None]:
print("üß† TREINAMENTO DOS MODELOS")

regressor = RegressionManager()

# Train the models: the linear baseline first, then the degree-2 and degree-3
# polynomial fits (the generator is consumed in order during unpacking).
linear_model = regressor.train_linear(X_train, X_test, y_train, y_test)
poly2_model, poly3_model = (
    regressor.train_polynomial(
        X_train, X_test, y_train, y_test, degree=poly_degree
    )
    for poly_degree in (2, 3)
)

print("\n‚úÖ Todos os modelos treinados!")

In [None]:
print("üìä COMPARA√á√ÉO DE MODELOS")

# Summary table with one row per trained model.
comparison_df = regressor.compare_models()
display(comparison_df)

# Best model: the row with the highest test-set R² score.
best_idx = comparison_df['R¬≤_Teste'].idxmax()
best_model = comparison_df.at[best_idx, 'Modelo']
best_r2 = comparison_df.at[best_idx, 'R¬≤_Teste']

print("\n MELHOR MODELO: {} (R¬≤ = {:.4f})".format(best_model, best_r2))

# Comparison chart: one bar per model, height = test-set R².
models = comparison_df['Modelo']
r2_scores = comparison_df['R¬≤_Teste']

plt.figure(figsize=(10, 6))
bars = plt.bar(
    x=models,
    height=r2_scores,
    color=['#2E86AB', '#A23B72', '#F18F01'],
)
plt.axhline(0, color='black', linestyle='-', alpha=0.3)
plt.title('Compara√ß√£o de Modelos - R¬≤ Score')
plt.ylabel('R¬≤ Score')
plt.xticks(rotation=45)

# Write each score just above the top of its bar.
for bar, value in zip(bars, r2_scores):
    plt.text(
        x=bar.get_x() + bar.get_width() / 2,
        y=bar.get_height() + 0.01,
        s=f'{value:.4f}',
        ha='center',
        va='bottom',
    )

plt.tight_layout()
plt.show()

In [None]:
print(" VISUALIZA√á√ÉO DAS REGRESS√ïES")

feature_name = X_train.columns[0]

# 1-D views of the single feature column, so matplotlib receives plain
# arrays instead of relying on implicit flattening of a DataFrame.
x_train_values = X_train.iloc[:, 0].to_numpy()
x_test_values = X_test.iloc[:, 0].to_numpy()

# Evenly spaced grid over the training range for drawing the fitted curve.
x_line = np.linspace(x_train_values.min(), x_train_values.max(), 100).reshape(-1, 1)
# The models were fitted on a DataFrame; predicting on a bare ndarray makes
# scikit-learn warn about missing feature names, so reuse the training columns.
x_line_df = pd.DataFrame(x_line, columns=X_train.columns)

for model_name, result in regressor.results.items():
    fig, (ax_train, ax_test) = plt.subplots(1, 2, figsize=(12, 4))

    # Training plot: data points plus the fitted regression curve
    ax_train.scatter(x_train_values, y_train, alpha=0.6, label='Dados Treino')
    y_line = regressor.models[model_name].predict(x_line_df)
    ax_train.plot(x_line.ravel(), y_line, 'r-', linewidth=2, label='Regress√£o')
    ax_train.set_title(f'{model_name} - Treino\nR¬≤: {result["metrics"]["train"]["R¬≤"]:.4f}')
    ax_train.set_xlabel(feature_name)
    ax_train.set_ylabel(TARGET_COLUMN)
    ax_train.legend()

    # Test plot: held-out points against the stored test predictions
    ax_test.scatter(x_test_values, y_test, alpha=0.6, label='Dados Teste')
    ax_test.scatter(x_test_values, result['predictions']['test'], color='red', alpha=0.8, label='Predi√ß√µes')
    ax_test.set_title(f'{model_name} - Teste\nR¬≤: {result["metrics"]["test"]["R¬≤"]:.4f}')
    ax_test.set_xlabel(feature_name)
    ax_test.set_ylabel(TARGET_COLUMN)
    ax_test.legend()

    fig.tight_layout()
    plt.show()

In [None]:
print(" AN√ÅLISE DE C√ÅLCULO")

# Prepare the data sorted by the analysed feature
analysis_df = df.sort_values(by=feature_name)

# np.gradient divides by the spacing between consecutive x values, so rows
# that share an x value (zero spacing) or contain missing values would give
# inf/NaN derivatives. Collapse the data to one point per distinct x (mean of
# y); groupby also returns the keys in ascending order.
curve_df = (
    analysis_df[[feature_name, TARGET_COLUMN]]
    .dropna()
    .groupby(feature_name, as_index=False)[TARGET_COLUMN]
    .mean()
)
x_data = curve_df[feature_name].to_numpy(dtype=float)
y_data = curve_df[TARGET_COLUMN].to_numpy(dtype=float)

if x_data.size < 2:
    raise ValueError(
        "A analise precisa de pelo menos dois valores distintos da feature"
    )

# np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid; use the
# new name when the installed NumPy provides it.
integrate = np.trapezoid if hasattr(np, "trapezoid") else np.trapz

# Numerical calculus: derivatives by finite differences, area by the
# trapezoidal rule
primeira_derivada = np.gradient(y_data, x_data)
segunda_derivada = np.gradient(primeira_derivada, x_data)
area_total = integrate(y_data, x_data)

print(" Primeira derivada calculada")
print(" Segunda derivada calculada")
print(f" √Årea sob a curva: {area_total:.4f}")

# Visualisation: 2x2 grid of panels
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Original data
axes[0,0].plot(x_data, y_data, 'o-', color='#2E86AB', alpha=0.7)
axes[0,0].set_title(f'Dados Originais\n{feature_name} vs {TARGET_COLUMN}')
axes[0,0].set_ylabel(TARGET_COLUMN)
axes[0,0].grid(True, alpha=0.3)

# First derivative
axes[0,1].plot(x_data, primeira_derivada, 'o-', color='#A23B72', alpha=0.7)
axes[0,1].axhline(y=0, color='red', linestyle='--', alpha=0.5)
axes[0,1].set_title('Primeira Derivada (Taxa de Varia√ß√£o)')
axes[0,1].set_ylabel('dy/dx')
axes[0,1].grid(True, alpha=0.3)

# Second derivative
axes[1,0].plot(x_data, segunda_derivada, 'o-', color='#F18F01', alpha=0.7)
axes[1,0].axhline(y=0, color='red', linestyle='--', alpha=0.5)
axes[1,0].set_title('Segunda Derivada (Concavidade)')
axes[1,0].set_xlabel(feature_name)
axes[1,0].set_ylabel('d¬≤y/dx¬≤')
axes[1,0].grid(True, alpha=0.3)

# Area under the curve
axes[1,1].fill_between(x_data, y_data, alpha=0.3, color='#1B998B')
axes[1,1].plot(x_data, y_data, 'o-', color='#1B998B', alpha=0.7)
axes[1,1].set_title(f'√Årea sob a curva: {area_total:.4f}')
axes[1,1].set_xlabel(feature_name)
axes[1,1].set_ylabel(TARGET_COLUMN)
axes[1,1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Report banner
print("=" * 50, " RELAT√ìRIO FINAL", "=" * 50, sep="\n")

# Dataset summary
print(
    "\n DADOS:",
    f"   - Dataset: {DATA_FILE}",
    f"   - Amostras: {len(df)}",
    f"   - Features: {len(df.columns) - 1}",
    f"   - Target: {TARGET_COLUMN}",
    sep="\n",
)

# One line per model, graded by its test-set R² (> 0.7, > 0.4, otherwise).
print("\n MODELOS:")
for idx, row in comparison_df.iterrows():
    if row['R¬≤_Teste'] > 0.7:
        status = "‚úÖ BOM"
    elif row['R¬≤_Teste'] > 0.4:
        status = "‚ö†Ô∏è  MODERADO"
    else:
        status = "‚ùå BAIXO"
    print("   - {}: R¬≤ = {:.4f} ({})".format(row['Modelo'], row['R¬≤_Teste'], status))

# Best model summary
print(
    "\n RESULTADO:",
    f"   - Melhor modelo: {best_model}",
    f"   - R¬≤ score: {best_r2:.4f}",
    f"   - Explica {best_r2*100:.1f}% da vari√¢ncia",
    sep="\n",
)

# Calculus analysis summary
print(
    "\n AN√ÅLISE DE C√ÅLCULO:",
    f"   - √Årea total sob a curva: {area_total:.4f}",
    f"   - Feature analisada: {feature_name}",
    sep="\n",
)

