# İş Anlama (Business Understanding):
Amaç:
Hindistan’daki 1901–2015 yıllarına ait aylık yağış verilerini kullanarak
yağış miktarlarının birliktelik ilişkilerini (association rules) analiz etmek.
Projenin amacı, uzun dönemli iklim verilerinde yağış miktarlarının ortak desenlerini belirlemek ve “düşük-orta-yüksek” yağış kategorileri arasında ilişki kurallarını keşfetmektir.


# Veri Anlama (Data Understanding)

In [8]:
import pandas as pd 
import numpy as np
df=pd.read_csv("rainfall in india 1901-2015.csv")
df.head()


Unnamed: 0,SUBDIVISION,YEAR,JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC,ANNUAL,Jan-Feb,Mar-May,Jun-Sep,Oct-Dec
0,ANDAMAN & NICOBAR ISLANDS,1901,49.2,87.1,29.2,2.3,528.8,517.5,365.1,481.1,332.6,388.5,558.2,33.6,3373.2,136.3,560.3,1696.3,980.3
1,ANDAMAN & NICOBAR ISLANDS,1902,0.0,159.8,12.2,0.0,446.1,537.1,228.9,753.7,666.2,197.2,359.0,160.5,3520.7,159.8,458.3,2185.9,716.7
2,ANDAMAN & NICOBAR ISLANDS,1903,12.7,144.0,0.0,1.0,235.1,479.9,728.4,326.7,339.0,181.2,284.4,225.0,2957.4,156.7,236.1,1874.0,690.6
3,ANDAMAN & NICOBAR ISLANDS,1904,9.4,14.7,0.0,202.4,304.5,495.1,502.0,160.1,820.4,222.2,308.7,40.1,3079.6,24.1,506.9,1977.6,571.0
4,ANDAMAN & NICOBAR ISLANDS,1905,1.3,0.0,3.3,26.9,279.5,628.7,368.7,330.5,297.0,260.7,25.4,344.7,2566.7,1.3,309.7,1624.9,630.8


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4116 entries, 0 to 4115
Data columns (total 19 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   SUBDIVISION  4116 non-null   object 
 1   YEAR         4116 non-null   int64  
 2   JAN          4112 non-null   float64
 3   FEB          4113 non-null   float64
 4   MAR          4110 non-null   float64
 5   APR          4112 non-null   float64
 6   MAY          4113 non-null   float64
 7   JUN          4111 non-null   float64
 8   JUL          4109 non-null   float64
 9   AUG          4112 non-null   float64
 10  SEP          4110 non-null   float64
 11  OCT          4109 non-null   float64
 12  NOV          4105 non-null   float64
 13  DEC          4106 non-null   float64
 14  ANNUAL       4090 non-null   float64
 15  Jan-Feb      4110 non-null   float64
 16  Mar-May      4107 non-null   float64
 17  Jun-Sep      4106 non-null   float64
 18  Oct-Dec      4103 non-null   float64
dtypes: flo

In [10]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
YEAR,4116.0,1958.218659,33.140898,1901.0,1930.0,1958.0,1987.0,2015.0
JAN,4112.0,18.95732,33.585371,0.0,0.6,6.0,22.2,583.7
FEB,4113.0,21.805325,35.909488,0.0,0.6,6.7,26.8,403.5
MAR,4110.0,27.359197,46.959424,0.0,1.0,7.8,31.3,605.6
APR,4112.0,43.127432,67.831168,0.0,3.0,15.7,49.95,595.1
MAY,4113.0,85.745417,123.234904,0.0,8.6,36.6,97.2,1168.6
JUN,4111.0,230.234444,234.710758,0.4,70.35,138.7,305.15,1609.9
JUL,4109.0,347.214334,269.539667,0.0,175.6,284.8,418.4,2362.8
AUG,4112.0,290.263497,188.770477,0.0,155.975,259.4,377.8,1664.6
SEP,4110.0,197.361922,135.408345,0.1,100.525,173.9,265.8,1222.0


In [11]:
df.isna().sum()

SUBDIVISION     0
YEAR            0
JAN             4
FEB             3
MAR             6
APR             4
MAY             3
JUN             5
JUL             7
AUG             4
SEP             6
OCT             7
NOV            11
DEC            10
ANNUAL         26
Jan-Feb         6
Mar-May         9
Jun-Sep        10
Oct-Dec        13
dtype: int64

# Veri Hazırlama (Data Preparation)

In [12]:
# "NA" ifadelerini boş değere çevir
df = df.replace("NA", pd.NA)

# Sayısal sütunları dönüştür
numeric_cols = df.columns.drop(["SUBDIVISION", "YEAR"])
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

# Eksik değerleri medyan ile doldur
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

# Sayısal verileri Low/Medium/High olarak kategorize et
def categorize(x):
    if x < 100:
        return "Low"
    elif x < 300:
        return "Medium"
    else:
        return "High"

df[numeric_cols] = df[numeric_cols].applymap(categorize)
df.head()


  df[numeric_cols] = df[numeric_cols].applymap(categorize)


Unnamed: 0,SUBDIVISION,YEAR,JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC,ANNUAL,Jan-Feb,Mar-May,Jun-Sep,Oct-Dec
0,ANDAMAN & NICOBAR ISLANDS,1901,Low,Low,Low,Low,High,High,High,High,High,High,High,Low,High,Medium,High,High,High
1,ANDAMAN & NICOBAR ISLANDS,1902,Low,Medium,Low,Low,High,High,Medium,High,High,Medium,High,Medium,High,Medium,High,High,High
2,ANDAMAN & NICOBAR ISLANDS,1903,Low,Medium,Low,Low,Medium,High,High,High,High,Medium,Medium,Medium,High,Medium,Medium,High,High
3,ANDAMAN & NICOBAR ISLANDS,1904,Low,Low,Low,Medium,High,High,High,Medium,High,Medium,High,Low,High,Low,High,High,High
4,ANDAMAN & NICOBAR ISLANDS,1905,Low,Low,Low,Low,Medium,High,High,High,Medium,Medium,Low,High,High,Low,High,High,High


# Modelleme (Modeling – Apriori & Association Rules)

In [13]:
from mlxtend.frequent_patterns import apriori, association_rules

# Kategorik veriyi one-hot encode et
df_encoded = pd.get_dummies(df[numeric_cols])

# Apriori algoritması
frequent_items = apriori(df_encoded, min_support=0.3, use_colnames=True)
rules = association_rules(frequent_items, metric="confidence", min_threshold=0.7)
rules.sort_values(by='lift', ascending=False).head()


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
1381691,"(APR_Low, MAY_Low, Jun-Sep_High, Oct-Dec_Low, ...","(FEB_Low, DEC_Low, NOV_Low, JAN_Low, Mar-May_L...",0.385811,0.413265,0.353499,0.916247,2.217091,1.0,0.194056,7.005524,0.893795,0.793348,0.857256,0.885813
1379100,"(FEB_Low, DEC_Low, NOV_Low, JAN_Low, Mar-May_L...","(APR_Low, MAY_Low, Jun-Sep_High, Oct-Dec_Low, ...",0.413265,0.385811,0.353499,0.855379,2.217091,1.0,0.194056,4.246889,0.935616,0.793348,0.764534,0.885813
1381692,"(APR_Low, MAY_Low, ANNUAL_High, Jan-Feb_Low, O...","(FEB_Low, DEC_Low, NOV_Low, JAN_Low, Mar-May_L...",0.39723,0.402332,0.353499,0.889908,2.211873,1.0,0.19368,5.428814,0.908962,0.792484,0.815798,0.884266
1379099,"(FEB_Low, DEC_Low, NOV_Low, JAN_Low, Mar-May_L...","(APR_Low, MAY_Low, ANNUAL_High, Jan-Feb_Low, O...",0.402332,0.39723,0.353499,0.878623,2.211873,1.0,0.19368,4.966102,0.916721,0.792484,0.798635,0.884266
1380704,"(FEB_Low, NOV_Low, JAN_Low, Mar-May_Low, OCT_L...","(DEC_Low, APR_Low, MAY_Low, Jun-Sep_High, Oct-...",0.414723,0.385811,0.353499,0.852373,2.209298,1.0,0.193494,4.160396,0.935228,0.790761,0.759638,0.88431


# Değerlendirme (Evaluation)

In [14]:
rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']].head(10)


Unnamed: 0,antecedents,consequents,support,confidence,lift
0,(FEB_Low),(JAN_Low),0.930515,0.975796,1.010409
1,(JAN_Low),(FEB_Low),0.930515,0.963522,1.010409
2,(JAN_Low),(MAR_Low),0.903304,0.935346,1.01153
3,(MAR_Low),(JAN_Low),0.903304,0.976879,1.01153
4,(APR_Low),(JAN_Low),0.83965,0.968881,1.003249
5,(JAN_Low),(APR_Low),0.83965,0.869434,1.003249
6,(JAN_Low),(MAY_Low),0.731535,0.757484,1.001865
7,(MAY_Low),(JAN_Low),0.731535,0.967545,1.001865
8,(JUN_Low),(JAN_Low),0.343294,0.947051,0.980644
9,(JUN_Medium),(JAN_Low),0.375607,0.977862,1.012549


In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

# 1) CSV'yi oku
df = pd.read_csv("rainfall in india 1901-2015.csv", na_values=["NA"])

# 2) SUBDIVISION ve YEAR dışındaki yağış sütunlarını bul
all_cols = df.columns.tolist()
drop_cols = [c for c in ["SUBDIVISION", "YEAR"] if c in all_cols]
rain_cols = [c for c in all_cols if c not in drop_cols]

# 3) Yağış sütunlarını sayısala çevir
df[rain_cols] = df[rain_cols].apply(pd.to_numeric, errors="coerce")

# 4) Tamamen boş (hep NaN) sütunları at
non_empty_cols = [c for c in rain_cols if not df[c].isna().all()]
df = df[drop_cols + non_empty_cols]
rain_cols = non_empty_cols

print("Kullanılabilir sütunlar:", rain_cols)


Kullanılabilir sütunlar: ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC', 'ANNUAL', 'Jan-Feb', 'Mar-May', 'Jun-Sep', 'Oct-Dec']


In [16]:
# Hedef olarak JUN'u seç (varsa):
if "JUN" not in rain_cols:
    raise ValueError("JUN sütunu bulunamadı, listede olanlardan biri hedef seçilmeli.")

target_col = "JUN"

# X: JUN dışındaki tüm yağış sütunları
feature_cols = [c for c in rain_cols if c != target_col]

# Eksikleri her sütunun medyanıyla doldur
df[feature_cols + [target_col]] = df[feature_cols + [target_col]].fillna(
    df[feature_cols + [target_col]].median()
)

X = df[feature_cols]
y = df[target_col]

print("X shape:", X.shape)
print("y örnek:", y.head())


X shape: (4116, 16)
y örnek: 0    517.5
1    537.1
2    479.9
3    495.1
4    628.7
Name: JUN, dtype: float64


In [17]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)


In [22]:
models = {
    "LinearRegression": LinearRegression(),
    "DecisionTree": DecisionTreeRegressor(random_state=42),
    "RandomForest": RandomForestRegressor(n_estimators=200, random_state=42),
    "GradientBoosting": GradientBoostingRegressor(random_state=42),
    
    "SVR": SVR(kernel="rbf")
}

results = []

print("Model Performansları:")
for name, model in models.items():
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    
    results.append({
        "Model": name,
        "R2": r2,
        "RMSE": rmse
    })
    print(f"{name}: R2={r2:.4f}, RMSE={rmse:.4f}")


Model Performansları:
LinearRegression: R2=0.9993, RMSE=76.6247
DecisionTree: R2=0.7777, RMSE=76.6247
RandomForest: R2=0.8925, RMSE=76.6247
GradientBoosting: R2=0.8884, RMSE=76.6247
SVR: R2=0.7285, RMSE=76.6247
