# Kahve Tüketimi ve Stres Analizi (Saf Versiyon - Bias-Free)

Bu analizde, "Kahve içme nedeni?" sütunu analizden çıkarılarak, stres ve kahve tüketimi arasındaki ilişki daha objektif bir şekilde incelenmiştir. Bu sayede "Stresle başa çıkma" cevabının yaratabileceği döngüsel mantık (hâlihazırda stresli olduğu için kahve içme durumu) elenmiştir.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the survey responses and prepare the cleaned frame used by every
# later step.
print("=" * 80)
print("KAHVE T√úKETƒ∞Mƒ∞ VE STRES ƒ∞Lƒ∞≈ûKƒ∞Sƒ∞ - SAF ANALƒ∞Z")
print("=" * 80)

df = pd.read_csv('anket.csv')
print(f"\n‚úì Veri y√ºklendi: {len(df)} katƒ±lƒ±mcƒ±, {len(df.columns)} s√ºtun")

# Columns removed before any analysis: the timestamp and the "reason for
# drinking coffee" answer (dropped to avoid bias, per the message printed
# below).
# NOTE(review): the second name ends with a trailing space; presumably it
# matches the CSV header exactly -- verify against anket.csv.
drop_columns = ['Zaman damgasƒ±', 'Kahve i√ßme nedeni? ']
df_clean = df.drop(columns=drop_columns)

print("\n‚ö†Ô∏è 'Kahve i√ßme nedeni?' s√ºtunu analizden √ßƒ±karƒ±ldƒ± (Bias √ñnleme).")

# Short ASCII column names, assigned by position.
# NOTE(review): this assumes exactly ten columns remain, in this order --
# confirm against the survey export.
df_clean.columns = [
    'Yas',
    'Cinsiyet',
    'Is_Yogunlugu',
    'Kahve_Miktar',
    'Kahve_Zamani',
    'Kahve_Hissi',
    'Stres_Duzeyi',
    'Uyku_Suresi',
    'Ruh_Hali',
    'Stresli_Kahve',
]

def simplify(value):
    """Reduce a (possibly multi-choice) answer to a single clean label.

    Comma-separated answers are collapsed to their first choice. The result
    is always stripped of surrounding whitespace, so the same answer typed
    with and without stray spaces maps to one label instead of two.

    Args:
        value: A raw cell value; non-string values are converted with
            ``str``.

    Returns:
        ``value`` unchanged when it is missing (``None``/NaN); otherwise the
        text before the first comma, stripped of surrounding whitespace.
    """
    if pd.isna(value):
        return value
    # partition() yields the whole string as its first element when no comma
    # is present, so single- and multi-choice answers share one code path
    # and both get stripped.
    first_choice, _, _ = str(value).partition(',')
    return first_choice.strip()

# Collapse the listed answer columns to a single label per respondent
# (see simplify), working on a copy so df_clean stays untouched.
df_ml = df_clean.copy()
for col in ['Kahve_Zamani', 'Kahve_Hissi', 'Uyku_Suresi', 'Ruh_Hali']:
    df_ml[col] = df_ml[col].apply(simplify)

# Label Encoding: replace every non-numeric column with integer codes.
df_encoded = df_ml.copy()
for column in df_encoded.columns:
    # Test for "not numeric" rather than dtype == 'object': with the pandas
    # 3 default string dtype, text columns are no longer 'object' and would
    # otherwise be skipped, leaving raw strings in the frame.
    if not pd.api.types.is_numeric_dtype(df_encoded[column]):
        le = LabelEncoder()
        # Missing answers become their own 'nan' category. This is done
        # explicitly because astype(str) alone keeps them missing under the
        # new string dtype instead of producing the text 'nan'.
        labels = df_encoded[column].fillna('nan').astype(str)
        df_encoded[column] = le.fit_transform(labels)

In [None]:
# STEP 1: CORRELATION ANALYSIS
# Correlate every encoded column with the stress level and print them from
# the most positive to the most negative, tagged by absolute strength.
print("\n" + "=" * 80)
print("ADIM 1: KORELASYON ANALƒ∞Zƒ∞")
print("=" * 80)

correlation_matrix = df_encoded.corr()
stres_correlations = correlation_matrix['Stres_Duzeyi'].sort_values(
    ascending=False
)

print("\nüìä Stres D√ºzeyi ile Diƒüer Deƒüi≈ükenler Arasƒ±ndaki Korelasyon:")
print("-" * 80)
for feature, corr_value in stres_correlations.items():
    # The target's correlation with itself is skipped.
    if feature == 'Stres_Duzeyi':
        continue

    magnitude = abs(corr_value)
    if magnitude >= 0.3:
        guc = "üî¥ G√ú√áL√ú"
    elif magnitude >= 0.15:
        guc = "üü° ORTA "
    else:
        guc = "üü¢ ZAYIF"
    print(f"{guc} - {feature:20s}: {corr_value:+.4f}")

In [None]:
# STEP 2: FEATURE IMPORTANCE ANALYSIS
# Fit a random forest on the training split and rank the features by the
# importance scores it reports.
print("\n" + "=" * 80)
print("ADIM 2: FEATURE IMPORTANCE ANALƒ∞Zƒ∞ (Random Forest)")
print("=" * 80)

# Target is the stress level; every other encoded column is a feature.
y = df_encoded['Stres_Duzeyi']
X = df_encoded.drop(columns='Stres_Duzeyi')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

rf_importance = RandomForestClassifier(
    n_estimators=200, random_state=42, max_depth=10
)
rf_importance.fit(X_train, y_train)

feature_imp_df = pd.DataFrame({
    'Ozellik': X.columns,
    'Onem_Skoru': rf_importance.feature_importances_,
})
feature_imp_df = feature_imp_df.sort_values('Onem_Skoru', ascending=False)

print("\nüìä √ñzellik √ñnem Skorlarƒ±:")
print("-" * 80)
for ozellik_adi, skor in zip(feature_imp_df['Ozellik'],
                             feature_imp_df['Onem_Skoru']):
    # One bar character per whole percentage point of importance.
    bar = '‚ñà' * int(skor * 100)
    if skor > 0.10:
        onem = "üî¥ √áOK √ñNEMLƒ∞"
    elif skor > 0.05:
        onem = "üü° √ñNEMLƒ∞    "
    else:
        onem = "üü¢ AZ √ñNEMLƒ∞ "
    print(f"{onem} - {ozellik_adi:20s}: {skor:.4f} {bar}")

In [None]:
# STEP 3: COMPARISON AND SELECTION
# Put the correlation and the random-forest importance of each feature side
# by side, then keep a feature when either measure clears its threshold.
print("\n" + "=" * 80)
print("ADIM 3: KORELASYON + FEATURE IMPORTANCE KAR≈ûILA≈ûTIRMASI")
print("=" * 80)

comparison_df = pd.DataFrame({
    'Ozellik': X.columns,
    'Korelasyon': [stres_correlations[name] for name in X.columns],
    'Feature_Importance': rf_importance.feature_importances_,
})
comparison_df['Abs_Korelasyon'] = comparison_df['Korelasyon'].abs()
comparison_df = comparison_df.sort_values(
    'Feature_Importance', ascending=False
)

print("\nüìä Her ƒ∞ki Analize G√∂re √ñzellik Deƒüerlendirmesi:")
print("-" * 80)
print(f"{'√ñzellik':<20} {'Korelasyon':>12} {'Feature Imp':>15} {'Karar':>15}")
print("-" * 80)

onemli_ozellikler = []
onemsiz_ozellikler = []

for kayit in comparison_df.itertuples(index=False):
    ozellik = kayit.Ozellik
    fi = kayit.Feature_Importance

    # Keep the feature if its absolute correlation OR its importance is
    # high enough; it is dropped only when both are low.
    if kayit.Abs_Korelasyon >= 0.10 or fi >= 0.05:
        karar = "‚úÖ KULLAN"
        onemli_ozellikler.append(ozellik)
    else:
        karar = "‚ùå √áIKAR"
        onemsiz_ozellikler.append(ozellik)

    print(f"{ozellik:<20} {kayit.Korelasyon:>+12.4f} {fi:>15.4f} {karar:>15}")

print(f"\n‚úÖ Kullanƒ±lacak √∂zellik sayƒ±sƒ±: {len(onemli_ozellikler)}")
if onemsiz_ozellikler:
    print(f"‚ùå √áƒ±karƒ±lacak √∂zellikler: {', '.join(onemsiz_ozellikler)}")
    print("   (D√º≈ü√ºk korelasyon ve feature importance nedeniyle)")

In [None]:
# STEP 4: MODEL TRAINING (with the selected features)
# Train three classifiers on the selected features only and compare their
# accuracy on the same held-out 20% split.
print("\n" + "=" * 80)
print("ADIM 4: SE√áƒ∞LEN √ñZELLƒ∞KLERLE MODEL Eƒûƒ∞Tƒ∞Mƒ∞")
print("=" * 80)

X_selected = X[onemli_ozellikler]
X_train_sel, X_test_sel, y_train_sel, y_test_sel = train_test_split(
    X_selected, y, test_size=0.2, random_state=42, stratify=y
)

secilen_sayisi = len(onemli_ozellikler)
orijinal_sayisi = X.shape[1]
print(f"\nOrijinal √∂zellik sayƒ±sƒ±: {orijinal_sayisi}")
print(f"Se√ßilen √∂zellik sayƒ±sƒ±: {secilen_sayisi}")
print(f"Azaltma oranƒ±: %{(1 - secilen_sayisi/orijinal_sayisi)*100:.1f}")


def _holdout_accuracy(estimator):
    """Fit on the selected-feature training split; return test accuracy."""
    estimator.fit(X_train_sel, y_train_sel)
    return accuracy_score(y_test_sel, estimator.predict(X_test_sel))


# Model 1: KNN
print("\n1. K-NEAREST NEIGHBORS (KNN)")
print("-" * 40)
knn = KNeighborsClassifier(n_neighbors=7)
knn_acc = _holdout_accuracy(knn)
print(f"   Doƒüruluk: %{knn_acc*100:.2f}")

# Model 2: Decision Tree
print("\n2. DECISION TREE")
print("-" * 40)
dt = DecisionTreeClassifier(max_depth=8, random_state=42)
dt_acc = _holdout_accuracy(dt)
print(f"   Doƒüruluk: %{dt_acc*100:.2f}")

# Model 3: Random Forest
print("\n3. RANDOM FOREST")
print("-" * 40)
rf = RandomForestClassifier(n_estimators=150, max_depth=6, random_state=42)
rf_acc = _holdout_accuracy(rf)
print(f"   Doƒüruluk: %{rf_acc*100:.2f}")

# RESULTS
# On a tie, max() returns the first of the tied entries in the list below.
models = [('KNN', knn_acc), ('Decision Tree', dt_acc), ('Random Forest', rf_acc)]
best_model = max(models, key=lambda pair: pair[1])

print("\n" + "=" * 80)
print("SONU√áLAR")
print("=" * 80)

print("\nüìä MODEL PERFORMANSLARI:")
ranked_models = sorted(models, key=lambda pair: pair[1], reverse=True)
for model_name, acc in ranked_models:
    if model_name == best_model[0]:
        star = "üèÜ "
    else:
        star = "   "
    print(f"{star}{model_name:20s}: %{acc*100:.2f}")

print(f"\nüèÜ EN ƒ∞Yƒ∞ MODEL: {best_model[0]} (%{best_model[1]*100:.2f})")