<a href="https://colab.research.google.com/github/tulaycetin/Eksik_Versi_Temizleme/blob/main/Untitled1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import kagglehub
import pandas as pd
from scipy.optimize import minimize
from sklearn.experimental import enable_iterative_imputer  # IterativeImputer'ı etkinleştirir.
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import (mean_squared_error, mean_absolute_error, r2_score,
                             confusion_matrix, accuracy_score, classification_report)
from sklearn.model_selection import train_test_split
from sklearn.mixture import GaussianMixture



# Veri Setini İndirme ve Ön İnceleme
# ---------------------------
# Kaggle veri setini indiriyoruz.
path = kagglehub.dataset_download("maxhorowitz/nflplaybyplay2009to2016")
print("Path to dataset files:", path)


# CSV dosyasını okuyoruz.
df = pd.read_csv(path + '/NFL Play by Play 2009-2017 (v4).csv')
df = df.sample(n=10000, random_state=42)
# -------------------------
# 1. Maximum Likelihood (MLE) ile Imputation
# -------------------------
def mle_imputation(df):
    df_copy = df.copy()
    # Örnek veri: normal dağılımdan 100 örnek
    np.random.seed(42)
    sample_data = np.random.normal(loc=5, scale=2, size=100)

    def neg_log_likelihood(params, data):
        mu, sigma = params[0], params[1]
        if sigma <= 0:
            return np.inf  # sigma negatif olamaz
        n = len(data)
        ll = -0.5 * n * np.log(2 * np.pi) - n * np.log(sigma) - np.sum((data - mu)**2) / (2 * sigma**2)
        return -ll

    initial_params = [np.mean(sample_data), np.std(sample_data, ddof=0)]
    result = minimize(neg_log_likelihood, initial_params, args=(sample_data,))
    mu_MLE, sigma_MLE = result.x
    print("MLE ile tahmin edilen mu:", mu_MLE)
    print("MLE ile tahmin edilen sigma:", sigma_MLE)

    # Her hücre için imputation: eksikse MLE ile elde edilen dağılımdan değer örnekle
    def impute_value(val):
        if pd.isna(val):
            return np.random.normal(mu_MLE, sigma_MLE)
        return val

    df_imputed = df_copy.applymap(impute_value)
    print("MLE imputasyon sonrası eksik değer sayısı:")
    print(df_imputed.isnull().sum())
    return df_imputed

# -------------------------
# 2. MICE (IterativeImputer) ile Imputation
# -------------------------
def mice_imputation(df):
    # Sadece sayısal sütunları kullanıyoruz
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    df_numeric = df[numeric_cols]

    imputer = IterativeImputer(random_state=42)
    df_imputed = pd.DataFrame(imputer.fit_transform(df_numeric), columns=numeric_cols)

    print("\nMICE sonrası ilk 5 satır:")
    print(df_imputed.head())
    return df_imputed

# -------------------------
# 3. EM Algoritması (GaussianMixture) ile Imputation
# -------------------------
def em_imputation(df):
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    df_numeric = df[numeric_cols].copy()

    # İlk adım: eksik değerleri sütun ortalamalarıyla dolduruyoruz
    simple_imputer = SimpleImputer(strategy="mean")
    data_imputed = simple_imputer.fit_transform(df_numeric)

    # Gaussian Mixture Model (tek bileşenli; temelde normal dağılım) ile modelleme
    gmm = GaussianMixture(n_components=1, max_iter=50, random_state=42)
    gmm.fit(data_imputed)

    # Eksik değerleri, GMM'den örnekleyerek dolduruyoruz
    missing_rows, missing_cols = np.where(np.isnan(df_numeric))
    for row, col in zip(missing_rows, missing_cols):
        sample = gmm.sample(1)[0][0, col]
        df_numeric.iloc[row, col] = sample
    return df_numeric

# -------------------------
# Yardımcı Fonksiyonlar
# -------------------------
def drop_high_cardinality(df, threshold=100):
    # Kategorik sütunlardan benzersiz değer sayısı threshold'dan fazla olanları kaldırıyoruz.
    cat_cols = df.select_dtypes(include=['object']).columns.tolist()
    high_card_cols = [col for col in cat_cols if df[col].nunique() > threshold]
    return df.drop(columns=high_card_cols)

def prepare_data(df, target_column='Win_Prob', drop_columns=['Date', 'time']):
    # Mevcut sütunlardan, drop_columns'ta tanımlı olanları seçiyoruz
    existing_drop_cols = [col for col in drop_columns if col in df.columns]
    y = df[[target_column]]
    X = df.drop(columns=[target_column] + existing_drop_cols)
    return X, y


def linear_model(X, y, threshold=0.5):
    # Eğitim ve test setlerine ayırma
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # Kategorik değişkenler için one-hot encoding
    X_train_enc = pd.get_dummies(X_train, drop_first=True)
    X_test_enc = pd.get_dummies(X_test, drop_first=True)
    X_train_enc, X_test_enc = X_train_enc.align(X_test_enc, join='left', axis=1, fill_value=0)

    # Lineer regresyon modelini eğitme
    lr = LinearRegression()
    lr.fit(X_train_enc, y_train)
    y_pred = lr.predict(X_test_enc)

    # Regresyon metrikleri
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print("Regression Metrics:")
    print("Mean Squared Error (MSE):", mse)
    print("Mean Absolute Error (MAE):", mae)
    print("R^2 Score:", r2)

    # Binary sınıflandırma değerlendirmesi
    y_pred_binary = (y_pred >= threshold).astype(int)
    y_test_binary = (y_test.values >= threshold).astype(int)
    cm = confusion_matrix(y_test_binary, y_pred_binary)
    print("\nConfusion Matrix:\n", cm)
    acc = accuracy_score(y_test_binary, y_pred_binary)
    print("Accuracy Score:", acc)
    print("\nClassification Report:\n", classification_report(y_test_binary, y_pred_binary))

    return lr

# -------------------------
# Ana Çalıştırma
# -------------------------
# Varsayalım ki 'df' veri setimiz mevcut

# 1. MLE İle Imputation
df_mle = mle_imputation(df)
df_mle = drop_high_cardinality(df_mle)
X_mle, y_mle = prepare_data(df_mle)

print("\n--- Maximum Likelihood Model Eğitimi ---")
linear_model(X_mle, y_mle)

# 2. MICE İle Imputation
df_mice = mice_imputation(df)
# MICE için tüm veri sayısal olduğu için 'Date' ve 'time' sütunları olmayabilir.
# Hedef ve özellikleri uygun şekilde belirliyoruz:
X_mice, y_mice = prepare_data(df_mice, target_column='Win_Prob', drop_columns=[])
print("\n--- MICE Model Eğitimi ---")
linear_model(X_mice, y_mice)

# 3. EM Algoritması ile Imputation
df_em = em_imputation(df)
df_em = drop_high_cardinality(df_em)
X_em, y_em = prepare_data(df_em)
print("\n--- EM Model Eğitimi ---")
linear_model(X_em, y_em)
linear_model(X_mle, y_mle)
linear_model(X_mice, y_mice)


Downloading from https://www.kaggle.com/api/v1/datasets/download/maxhorowitz/nflplaybyplay2009to2016?dataset_version_number=6...


100%|██████████| 274M/274M [00:05<00:00, 57.4MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/maxhorowitz/nflplaybyplay2009to2016/versions/6


  df = pd.read_csv(path + '/NFL Play by Play 2009-2017 (v4).csv')


MLE ile tahmin edilen mu: 4.792306965211812
MLE ile tahmin edilen sigma: 1.8072323532892591


  df_imputed = df_copy.applymap(impute_value)


MLE imputasyon sonrası eksik değer sayısı:
Date        0
GameID      0
Drive       0
qtr         0
down        0
           ..
Win_Prob    0
WPA         0
airWPA      0
yacWPA      0
Season      0
Length: 102, dtype: int64

--- Maximum Likelihood Model Eğitimi ---
Regression Metrics:
Mean Squared Error (MSE): 16.179314549300095
Mean Absolute Error (MAE): 0.288404049847858
R^2 Score: -11.939243148215292

Confusion Matrix:
 [[807 105]
 [119 969]]
Accuracy Score: 0.888

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.88      0.88       912
           1       0.90      0.89      0.90      1088

    accuracy                           0.89      2000
   macro avg       0.89      0.89      0.89      2000
weighted avg       0.89      0.89      0.89      2000


MICE sonrası ilk 5 satır:
         GameID  Drive  qtr      down  TimeUnder  TimeSecs  PlayTimeDiff  \
0  2.011122e+09    7.0  2.0  2.000000       15.0    2700.0           0.0  

[1;30;43mGörüntülenen çıkış son 5000 satıra kısaltıldı.[0m
  rng.multivariate_normal(mean, covariance, int(sample))
  rng.multivariate_normal(mean, covariance, int(sample))
  rng.multivariate_normal(mean, covariance, int(sample))
  rng.multivariate_normal(mean, covariance, int(sample))
  rng.multivariate_normal(mean, covariance, int(sample))
  rng.multivariate_normal(mean, covariance, int(sample))
  rng.multivariate_normal(mean, covariance, int(sample))
  rng.multivariate_normal(mean, covariance, int(sample))
  rng.multivariate_normal(mean, covariance, int(sample))
  rng.multivariate_normal(mean, covariance, int(sample))
  rng.multivariate_normal(mean, covariance, int(sample))
  rng.multivariate_normal(mean, covariance, int(sample))
  rng.multivariate_normal(mean, covariance, int(sample))
  rng.multivariate_normal(mean, covariance, int(sample))
  rng.multivariate_normal(mean, covariance, int(sample))
  rng.multivariate_normal(mean, covariance, int(sample))
  rng.multivariate_normal(m


--- EM Model Eğitimi ---
Regression Metrics:
Mean Squared Error (MSE): 0.009110343803403295
Mean Absolute Error (MAE): 0.06194478706675693
R^2 Score: 0.8900430448589823

Confusion Matrix:
 [[ 850   62]
 [  24 1064]]
Accuracy Score: 0.957

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.93      0.95       912
           1       0.94      0.98      0.96      1088

    accuracy                           0.96      2000
   macro avg       0.96      0.95      0.96      2000
weighted avg       0.96      0.96      0.96      2000

Regression Metrics:
Mean Squared Error (MSE): 16.179314549300095
Mean Absolute Error (MAE): 0.288404049847858
R^2 Score: -11.939243148215292

Confusion Matrix:
 [[807 105]
 [119 969]]
Accuracy Score: 0.888

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.88      0.88       912
           1       0.90      0.89      0.90      1088

    accuracy  