# Lab 4: Aholi o'sishi-kamayishi bo'yicha Decision Tree Regression

**Dataset:** World Development Indicators — Population growth (annual %)

**Qadamlar:**
1. Datasetni yuklash
2. Pipeline va Preprocessing
3. Matplotlib bilan vizualizatsiya
4. Decision Tree Regression modeli

## 1. Kutubxonalarni import qilish va datasetni yuklash

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Load the raw CSV into `df`. skiprows=4 tells pandas to skip the first four
# lines of the file before looking for the header row.
# NOTE(review): presumably those four lines are the export's metadata
# preamble — confirm against lab4.csv.
df = pd.read_csv('lab4.csv', skiprows=4)
print(f"Dataset shakli: {df.shape}")
# Last expression of the cell: shows the first rows as the cell output.
df.head()

## 2. Preprocessing va Pipeline

In [None]:
# Tidy the raw frame in one chained pass:
#   1. drop the indicator descriptor columns (errors='ignore' makes this a
#      no-op when they are not present),
#   2. discard every column whose name starts with "Unnamed",
#   3. remove rows in which every cell is missing.
df = (
    df.drop(columns=['Indicator Name', 'Indicator Code'], errors='ignore')
    .loc[:, lambda frame: ~frame.columns.str.contains('^Unnamed')]
    .dropna(how='all')
)

print(f"Tozalangandan keyin: {df.shape}")
df.head()

In [None]:
# Reshape from one-column-per-year ("wide") to one-row-per-country-and-year
# ("long"). Year columns are recognised by an all-digit column name.
year_columns = [name for name in df.columns if name.isdigit()]

df_long = pd.melt(
    df,
    id_vars=['Country Name', 'Country Code'],
    value_vars=year_columns,
    var_name='Year',
    value_name='Population_Growth',
)

# Give both new columns numeric dtypes; growth values that cannot be parsed
# as numbers become NaN instead of raising (errors='coerce').
df_long = df_long.assign(
    Year=lambda frame: frame['Year'].astype(int),
    Population_Growth=lambda frame: pd.to_numeric(
        frame['Population_Growth'], errors='coerce'
    ),
)

print(f"Long format: {df_long.shape}")
df_long.head(10)

In [None]:
# Report how much of the long table is missing: per column first, then the
# count and share of missing values in the target column.
growth_is_missing = df_long['Population_Growth'].isnull()

print("Bo'sh qiymatlar soni:")
print(df_long.isnull().sum())
print(f"\nUmumiy qatorlar: {len(df_long)}")
print(f"Bo'sh Population_Growth: {growth_is_missing.sum()}")
print(f"Bo'sh qiymatlar ulushi: {growth_is_missing.mean():.2%}")

In [None]:
# Keep only the rows that have a growth value, then renumber the index
# from zero so it is contiguous again.
has_growth = df_long['Population_Growth'].notna()
df_clean = df_long[has_growth].reset_index(drop=True)

# Summary figures for the cleaned table.
n_countries = df_clean['Country Name'].nunique()
first_year, last_year = df_clean['Year'].min(), df_clean['Year'].max()

print(f"Tozalangan dataset: {df_clean.shape}")
print(f"\nDavlatlar soni: {n_countries}")
print(f"Yillar diapazoni: {first_year} - {last_year}")
df_clean.describe()

In [None]:
# Map every country name to an integer code and store it in a new
# 'Country_Encoded' column. The fitted encoder stays available as `le`.
le = LabelEncoder()
le.fit(df_clean['Country Name'])
df_clean['Country_Encoded'] = le.transform(df_clean['Country Name'])

print("Kodlangan ustunlar:")
preview_columns = ['Country Name', 'Country_Encoded', 'Year', 'Population_Growth']
df_clean[preview_columns].head(10)

## 3. Vizualizatsiya (Matplotlib)

In [None]:
# Horizontal bar chart of the ten countries with the largest mean growth
# across all of their rows in df_clean.
mean_growth_by_country = df_clean.groupby('Country Name')['Population_Growth'].mean()
top10 = mean_growth_by_country.nlargest(10)

fig, ax = plt.subplots(figsize=(12, 6))
top10.plot.barh(ax=ax, color='steelblue', edgecolor='black')
ax.set_xlabel('O\'rtacha aholi o\'sishi (%)')
ax.set_ylabel('Davlat')
ax.set_title('Top 10: Eng yuqori o\'rtacha aholi o\'sishi')
fig.tight_layout()
plt.show()

In [None]:
# Year-by-year growth curves for the five countries with the highest mean
# growth. Lines are drawn in ranking order, which also fixes legend order.
top5_countries = (
    df_clean.groupby('Country Name')['Population_Growth']
    .mean()
    .nlargest(5)
    .index
)

fig, ax = plt.subplots(figsize=(14, 6))
for name in top5_countries:
    rows = df_clean.loc[df_clean['Country Name'] == name]
    ax.plot(rows['Year'], rows['Population_Growth'], label=name, linewidth=1.5)

ax.set_xlabel('Yil')
ax.set_ylabel('Aholi o\'sishi (%)')
ax.set_title('Top 5 davlat: Yillar bo\'yicha aholi o\'sishi')
ax.legend(loc='best')
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# Trend of the per-year mean growth, taken as an unweighted mean over every
# row of df_clean for that year, drawn as a line with a shaded area below.
world_avg = df_clean.groupby('Year')['Population_Growth'].mean()
years, yearly_mean = world_avg.index, world_avg.values

fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(years, yearly_mean, color='darkred', linewidth=2)
ax.fill_between(years, yearly_mean, alpha=0.2, color='red')
ax.set_xlabel('Yil')
ax.set_ylabel('O\'rtacha aholi o\'sishi (%)')
ax.set_title('Dunyo bo\'yicha o\'rtacha aholi o\'sishi tendentsiyasi')
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# Histogram of all growth values with a dashed vertical marker at the mean.
growth = df_clean['Population_Growth']
mean_growth = growth.mean()

fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(growth, bins=50, color='teal', edgecolor='black', alpha=0.7)
ax.axvline(mean_growth, color='red', linestyle='--', label=f'O\'rtacha: {mean_growth:.2f}%')
ax.set_xlabel('Aholi o\'sishi (%)')
ax.set_ylabel('Chastota')
ax.set_title('Aholi o\'sishi taqsimoti')
ax.legend()
fig.tight_layout()
plt.show()

## 4. Decision Tree Regression

In [None]:
# Features are the encoded country and the year; the target is the growth
# value.
X, y = df_clean[['Country_Encoded', 'Year']], df_clean['Population_Growth']

# Hold out 20% of the rows for testing; random_state pins the split so the
# notebook is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

train_shape, test_shape = X_train.shape, X_test.shape
print(f"Train: {train_shape}, Test: {test_shape}")

In [None]:
# Three-stage pipeline: mean imputation, standardization, then a decision
# tree limited to depth 10 that needs at least 5 samples to split a node.
# Because the scaler runs before the model, the tree is fitted on
# standardized feature values rather than the raw ones.
steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('model', DecisionTreeRegressor(max_depth=10, min_samples_split=5, random_state=42)),
]
pipeline = Pipeline(steps)

# Fit on the training rows, then predict the held-out rows (fit returns the
# pipeline itself, so the two calls can be chained).
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

print("Model o'qitildi!")

In [None]:
# Score the held-out predictions with four regression metrics.
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE computed above

# Emit the report in a single call; sep="\n" puts each piece on its own
# line, exactly as separate print calls would.
banner = "=" * 45
print(
    banner,
    "   Decision Tree Regression Natijalari",
    banner,
    f"  MSE  (Mean Squared Error):    {mse:.4f}",
    f"  RMSE (Root Mean Squared Error):{rmse:.4f}",
    f"  MAE  (Mean Absolute Error):   {mae:.4f}",
    f"  R2   (R-squared Score):        {r2:.4f}",
    banner,
    sep="\n",
)

In [None]:
# Scatter of actual against predicted values. The dashed red diagonal spans
# the range of the actual values and marks where prediction equals truth.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred, alpha=0.3, color='steelblue', edgecolor='k', s=20)

lowest, highest = y_test.min(), y_test.max()
ax.plot([lowest, highest], [lowest, highest], 'r--', linewidth=2, label='Ideal')

ax.set_xlabel('Haqiqiy qiymatlar')
ax.set_ylabel('Bashorat qilingan qiymatlar')
ax.set_title(f'Decision Tree Regression: Haqiqiy vs Bashorat (R2={r2:.4f})')
ax.legend()
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# Distribution of prediction errors (actual minus predicted), with a dashed
# reference line at zero error.
residuals = y_test - y_pred

fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(residuals, bins=50, color='salmon', edgecolor='black', alpha=0.7)
ax.axvline(0, color='black', linestyle='--', linewidth=1.5)
ax.set_xlabel('Xatolik (Haqiqiy - Bashorat)')
ax.set_ylabel('Chastota')
ax.set_title('Xatoliklar taqsimoti (Residuals)')
fig.tight_layout()
plt.show()

In [None]:
# Draw the top levels of the fitted tree (drawing depth capped at 3 so the
# figure stays legible).
# NOTE: the model step sits after the pipeline's scaler, so the split
# thresholds shown in the nodes are in standardized units, not raw
# country codes or calendar years.
tree_model = pipeline.named_steps['model']

fig, ax = plt.subplots(figsize=(20, 10))
plot_tree(
    tree_model,
    max_depth=3,
    feature_names=['Country_Encoded', 'Year'],
    filled=True,
    rounded=True,
    fontsize=8,
    ax=ax,
)
ax.set_title('Decision Tree (birinchi 3 daraja)')
fig.tight_layout()
plt.show()

In [None]:
# Horizontal bars of the tree's feature importances, each annotated with
# its value to four decimals just right of the bar end.
importances = tree_model.feature_importances_
features = ['Country_Encoded', 'Year']

fig, ax = plt.subplots(figsize=(8, 4))
ax.barh(features, importances, color=['steelblue', 'coral'], edgecolor='black')
ax.set_xlabel('Muhimlik darajasi')
ax.set_title('Feature Importance')
for row, score in enumerate(importances):
    ax.text(score + 0.01, row, f'{score:.4f}', va='center')
fig.tight_layout()
plt.show()

## Xulosa

1. **Dataset** — 266 ta davlat uchun 1960-2025 yillar orasidagi aholi o'sishi (annual %) ma'lumotlari qayta ishlandi
2. **Preprocessing** — ma'lumotlar wide formatdan long formatga o'tkazildi, bo'sh qiymatlar tozalandi, LabelEncoder qo'llanildi
3. **Pipeline** — SimpleImputer + StandardScaler + DecisionTreeRegressor pipeline yaratildi
4. **Natija** — Model MSE, RMSE, MAE va R2 metrikalari orqali baholandi