In [1]:
# Required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from lightgbm import LGBMRegressor
import gc

## 1. Veriyi Yükle ve Eksiği Kaldır

In [2]:
# Read the raw training table and keep only the rows whose target
# ('winPlacePerc') is present.
print("Loading data...")
df = (
    pd.read_csv('../pubg_predictor/data/train_V2.csv')
    .dropna(subset=['winPlacePerc'])
)
print("Data loaded and successfully cleaned.")

Loading data...
Data loaded and successfully cleaned.


## 2. Feature Engineering

In [6]:
print("Starting feature engineering...")

# Feature 1 - sum of the ride, walk and swim distance columns.
df['totalDistance'] = df['rideDistance'].add(df['walkDistance']).add(df['swimDistance'])

# Feature 2 - heals and boosts combined into one count.
df['healthItems'] = df['heals'].add(df['boosts'])

# Feature 3 - headshot kills divided by kills; a 0/0 division produces NaN,
# which is replaced with 0.
df['headshotRate'] = df['headshotKills'].div(df['kills']).fillna(0)

# Feature 4 - assists plus revives.
df['teamwork'] = df['assists'].add(df['revives'])

# Remove the identifier columns, then one-hot encode 'matchType'.
# Re-running this cell on the already-transformed frame fails, because the
# dropped columns no longer exist.
df = pd.get_dummies(
    df.drop(columns=["Id", "groupId", "matchId"]),
    columns=['matchType'],
)

print(f"Feature engineering completed! New shape: {df.shape}")

Starting feature engineering...
Feature engineering completed! New shape: (4446965, 45)


## 3. Model Eğitimi

In [7]:
# MODEL TRAINING
# Split the frame into the regression target and the predictor matrix.
y = df['winPlacePerc']
X = df.drop(columns=['winPlacePerc'])

# Free RAM: the full frame is no longer needed once X and y exist.
del df
gc.collect()

# Hold out 20% of the rows for validation (fixed seed for repeatability).
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build and fit the gradient-boosted regressor; fit() returns the estimator.
print("Training with new features...")
model = LGBMRegressor(
    n_estimators=100,
    learning_rate=0.05,
    n_jobs=-1,
    random_state=42,
).fit(X_train, y_train)

# Predict on the hold-out rows and score with mean absolute error.
preds = model.predict(X_val)
mae = mean_absolute_error(y_true=y_val, y_pred=preds)

# Report the new score next to the hard-coded baseline value.
print(
    "-" * 40,
    f"ðŸ“‰ NEW MAE SCORE: {mae:.5f}",
    "Old Score (Baseline): 0.06367",
    "-" * 40,
    sep="\n",
)

Training with new features...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.098397 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3208
[LightGBM] [Info] Number of data points in the train set: 3557572, number of used features: 44
[LightGBM] [Info] Start training from score 0.472937
----------------------------------------
ðŸ“‰ NEW MAE SCORE: 0.06337
Old Score (Baseline): 0.06367
----------------------------------------


## 4. Analiz
Görüldüğü üzere, yaptığımız kolaylaştırmalar sonucunda 0.0003'lük bir iyileşme gerçekleşti. LightGBM yeteri kadar zeki bir model olduğu için bu özellikler pek etki etmedi; o nedenle başka yöntemler deneyeceğiz.

## 5. Advanced Feature Engineering

In [None]:
# Required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from lightgbm import LGBMRegressor
import gc


def _add_basic_features(frame):
    """Add the simple per-row derived columns to ``frame`` in place.

    Columns added, in this order: ``totalDistance``, ``healthItems``,
    ``headshotRate``, ``teamwork``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain ``walkDistance``, ``rideDistance``, ``swimDistance``,
        ``heals``, ``boosts``, ``headshotKills``, ``kills``, ``revives`` and
        ``assists``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the new columns appended.
    """
    frame['totalDistance'] = frame['walkDistance'] + frame['rideDistance'] + frame['swimDistance']
    frame['healthItems'] = frame['heals'] + frame['boosts']
    # A 0/0 division produces NaN; those rows get a rate of 0.
    frame['headshotRate'] = (frame['headshotKills'] / frame['kills']).fillna(0)
    frame['teamwork'] = frame['revives'] + frame['assists']
    return frame


def _add_group_features(frame):
    """Add team-level and match-level aggregate columns to ``frame`` in place.

    Columns added, in this order: ``groupSize``, ``matchSize``,
    ``matchMeanKills``, ``matchMeanDamage``, ``killsRel``, ``damageRel``,
    ``maxTeamKills``, ``maxTeamDamage``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain ``groupId``, ``matchId``, ``kills`` and ``damageDealt``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the new columns appended.
    """
    # Build each grouping once and reuse it for every aggregate on that key.
    by_group = frame.groupby('groupId')
    by_match = frame.groupby('matchId')

    # Team size: number of rows sharing a groupId.
    frame['groupSize'] = by_group['groupId'].transform('count')

    # Match size: number of rows sharing a matchId.
    frame['matchSize'] = by_match['matchId'].transform('count')

    # Per-match averages, broadcast back to every row of the match.
    frame['matchMeanKills'] = by_match['kills'].transform('mean')
    frame['matchMeanDamage'] = by_match['damageDealt'].transform('mean')

    # Row value relative to its match average; a 0/0 division produces NaN,
    # which is replaced with 0.
    frame['killsRel'] = (frame['kills'] / frame['matchMeanKills']).fillna(0)
    frame['damageRel'] = (frame['damageDealt'] / frame['matchMeanDamage']).fillna(0)

    # Best value inside each team, broadcast back to every row of the team.
    frame['maxTeamKills'] = by_group['kills'].transform('max')
    frame['maxTeamDamage'] = by_group['damageDealt'].transform('max')
    return frame


print("Loading data...")
df = pd.read_csv('../pubg_predictor/data/train_V2.csv')
# Cleaning: drop the rows whose 'winPlacePerc' value is missing.
df = df.dropna(subset=['winPlacePerc'])
print("Data loaded and successfully cleaned.")

# --- FEATURE ENGINEERING 1: SIMPLE DERIVED COLUMNS ---
print("Basic Features are being created...")
df = _add_basic_features(df)

# --- FEATURE ENGINEERING 2: GROUPED AGGREGATES ---
print("Advanced Features are being created. This may take a while...")
df = _add_group_features(df)

# -------------------------------------------------------

# The ID columns are not used as model inputs.
df = df.drop(columns=['Id', 'groupId', 'matchId'])

# Categorical conversion: one-hot encode 'matchType'.
df = pd.get_dummies(df, columns=['matchType'])

# MODEL TRAINING
print("Model is being trained with new features...")
X = df.drop(columns=['winPlacePerc'])
y = df['winPlacePerc']

# Free RAM: the full frame is no longer needed once X and y exist.
del df
gc.collect()

# NOTE(review): this is a row-wise random split, so rows from the same
# groupId/matchId can land on both sides while carrying identical
# group-level feature values. Confirm whether a group-aware split is needed
# for an unbiased validation score.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

model = LGBMRegressor(n_estimators=100, learning_rate=0.05, n_jobs=-1, random_state=42)
model.fit(X_train, y_train)

preds = model.predict(X_val)
mae = mean_absolute_error(y_val, preds)

print("-" * 40)
print(f"ðŸš€ NEW MAE SCORE: {mae:.5f}")
print("Old Score: 0.06337")
print("-" * 40)

Loading data...
Data loaded and successfully cleaned.
Basic Features are being created...
Advanced Features are being created. This may take a while...
Model is being trained with new features...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.113496 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4645
[LightGBM] [Info] Number of data points in the train set: 3557572, number of used features: 52
[LightGBM] [Info] Start training from score 0.472937
----------------------------------------
ðŸš€ NEW MAE SCORE: 0.05892
Old Score: 0.06337
----------------------------------------


## 6. Analiz-2
Ham verilerle eğitilen baseline modelin hatası 0.0636 iken, 'maç içi ortalama performans' (relative performance) ve 'grup istatistikleri' eklendiğinde hata 0.0589'a düştü. Bu durum, bir oyuncunun mutlak istatistiklerinden ziyade, rakiplerine göre ne kadar iyi oynadığının sıralamayı belirlediğini kanıtlamaktadır. Yapmış olduğumuz yeni geliştirmeler işe yaradı: hata payı 0.06337'den 0.05892'ye düştü.