<h1> Exploratory Data Analysis</h1>

In [2]:
import pandas as pd
import re

# Read the scraped listings file and take a first look at the raw table:
# its dimensions, the number of missing values per column, and the first rows.
df = pd.read_csv("avito_scraped_data2.csv")

preview = df.head(n=5)
missing_values = df.isna().sum()
shape = df.shape

(shape, missing_values, preview)

((9018, 6),
 type_and_location       0
 price                2056
 room                 4411
 bathroom             4417
 surface              5333
 link                    0
 dtype: int64,
                                  type_and_location      price  room  bathroom  \
 0              Appartements dans Temara, Massira 1    970‚ÄØ000   3.0       2.0   
 1   Appartements dans Mohammedia, Quartier du Parc    940‚ÄØ000   1.0       1.0   
 2    Villas et Riads dans Mohammedia, Centre Ville  3‚ÄØ750‚ÄØ000   4.0       1.0   
 3               Maisons dans Agadir, Autre secteur        NaN   3.0       4.0   
 4  Appartements dans Ksar el-Kebir, Toute la ville    320‚ÄØ000   2.0       1.0   
 
   surface                                               link  
 0   98 m¬≤  https://www.avito.ma/fr/massira_1/appartements...  
 1   65 m¬≤  https://www.avito.ma/fr/quartier_du_parc/appar...  
 2     NaN  https://www.avito.ma/fr/centre_ville/villas_et...  
 3   83 m¬≤  https://www.avito.ma/fr/autre_secteu

<h2>Split 'type_and_location' into 3 columns</h2>

In [3]:
# 'type_and_location' looks like "<property type> dans <city>, <zone>".
# n=1 splits only at the FIRST separator, so a location that itself contains
# "dans", or a zone that contains ", ", is kept whole instead of having its
# tail silently discarded into extra, unused columns.
# The pieces are not stripped: "Appartements dans Temara" yields
# "Appartements " (trailing space) and " Temara" (leading space).
split_cols = df['type_and_location'].str.split('dans', n=1, expand=True)

df['property_type'] = split_cols[0]
df['location'] = split_cols[1]

# "<city>, <zone>" -> city / zone; zone is missing when there is no ", ".
split2 = df['location'].str.split(', ', n=1, expand=True)
df['city'] = split2[0]
df['zone'] = split2[1]

# The combined columns are no longer needed once their parts are extracted.
df.drop(columns=['location', 'type_and_location'], inplace=True)
df.head(5)

Unnamed: 0,price,room,bathroom,surface,link,property_type,city,zone
0,970‚ÄØ000,3.0,2.0,98 m¬≤,https://www.avito.ma/fr/massira_1/appartements...,Appartements,Temara,Massira 1
1,940‚ÄØ000,1.0,1.0,65 m¬≤,https://www.avito.ma/fr/quartier_du_parc/appar...,Appartements,Mohammedia,Quartier du Parc
2,3‚ÄØ750‚ÄØ000,4.0,1.0,,https://www.avito.ma/fr/centre_ville/villas_et...,Villas et Riads,Mohammedia,Centre Ville
3,,3.0,4.0,83 m¬≤,https://www.avito.ma/fr/autre_secteur/maisons/...,Maisons,Agadir,Autre secteur
4,320‚ÄØ000,2.0,1.0,80 m¬≤,https://www.avito.ma/fr/ksar_el_kebir/appartem...,Appartements,Ksar el-Kebir,Toute la ville


<h2>Clean price</h2>

In [4]:
# Keep only the digits of the price text (this removes thousands separators and
# any other non-digit character) and convert the result to a float column.
# pd.to_numeric(errors="coerce") maps strings left empty by the clean-up to NaN;
# the earlier .replace("", pd.NA).astype("float64") can raise a TypeError when a
# pd.NA ends up in the object column.
df["price"] = pd.to_numeric(
    df["price"].str.replace(r"[^\d]", "", regex=True),
    errors="coerce",
).astype("float64")

# Rows without a price are dropped and the index is renumbered from 0.
df = df.dropna(subset=['price']).reset_index(drop=True)

shape = df.shape
missing_values = df.isnull().sum()
preview = df.head(5)

shape,missing_values, preview

((6962, 8),
 price               0
 room             3300
 bathroom         3286
 surface          3977
 link                0
 property_type       0
 city                0
 zone             1558
 dtype: int64,
        price  room  bathroom surface  \
 0   970000.0   3.0       2.0   98 m¬≤   
 1   940000.0   1.0       1.0   65 m¬≤   
 2  3750000.0   4.0       1.0     NaN   
 3   320000.0   2.0       1.0   80 m¬≤   
 4  1210000.0   2.0       2.0   86 m¬≤   
 
                                                 link     property_type  \
 0  https://www.avito.ma/fr/massira_1/appartements...     Appartements    
 1  https://www.avito.ma/fr/quartier_du_parc/appar...     Appartements    
 2  https://www.avito.ma/fr/centre_ville/villas_et...  Villas et Riads    
 3  https://www.avito.ma/fr/ksar_el_kebir/appartem...     Appartements    
 4  https://www.avito.ma/fr/mimosas/appartements/A...     Appartements    
 
              city              zone  
 0          Temara         Massira 1  
 1     

In [5]:
# Summary statistics of the cleaned prices, followed by the 20 smallest
# distinct prices with their frequencies.
print(df['price'].describe())
price_counts = df['price'].value_counts().sort_index()
print(price_counts.head(20))

count    6.962000e+03
mean     1.825990e+06
std      3.352526e+06
min      1.000000e+02
25%      5.740000e+05
50%      9.598000e+05
75%      1.760000e+06
max      1.000000e+08
Name: price, dtype: float64
price
100.0     1
108.0     1
110.0     1
125.0     1
150.0     2
160.0     1
200.0     1
219.0     2
220.0     1
299.0     1
330.0     1
350.0     1
400.0     6
450.0     1
500.0     2
600.0     3
650.0     1
700.0     2
800.0     1
1000.0    2
Name: count, dtype: int64


In [6]:
# Lower price threshold: listings cheaper than this are removed below.
price_min = 200000

# Row count before filtering, for reference
print(f"Nombre d'annonces avant filtrage: {df.shape[0]}")

# Keep only the rows whose price is at least price_min
has_valid_price = df['price'].ge(price_min)
df = df[has_valid_price]

# Row count after filtering
print(f"Nombre d'annonces apr√®s filtrage: {df.shape[0]}")

# Quick check: distribution of the remaining prices
print(df['price'].describe())

Nombre d'annonces avant filtrage: 6962
Nombre d'annonces apr√®s filtrage: 6596
count    6.596000e+03
mean     1.924451e+06
std      3.417383e+06
min      2.000000e+05
25%      6.400000e+05
50%      1.000000e+06
75%      1.827425e+06
max      1.000000e+08
Name: price, dtype: float64


<h2>Clean surface</h2>

In [7]:
# Parse the surface text (a number followed by a unit suffix) into a float.
#  1. remove every whitespace character, so a value written with a thousands
#     separator ("1 200 ...") is read as 1200 instead of being coerced to NaN;
#  2. keep only the leading number (optional decimal part with "." or ",");
#     this drops the unit suffix without depending on its exact spelling;
#  3. normalise a decimal comma to a dot and convert.
# Missing values become the text "nan" in step 0, match nothing in step 2 and
# therefore stay NaN.
surface_number = (
    df['surface']
    .astype(str)
    .str.replace(r"\s+", "", regex=True)
    .str.extract(r"^(\d+(?:[.,]\d+)?)", expand=False)
    .str.replace(",", ".", regex=False)
)
df['surface'] = pd.to_numeric(surface_number, errors='coerce')

<h2>Convert room and bathroom to numeric</h2>

In [8]:
# Coerce both count columns to numeric dtype; values that cannot be parsed
# become NaN.
for count_col in ('room', 'bathroom'):
    df[count_col] = pd.to_numeric(df[count_col], errors='coerce')

In [9]:
# Drop the listings whose category is one of: land and farms, premises,
# offices, other real estate.
# Labels are compared after stripping surrounding whitespace, so the filter
# does not depend on an incidental trailing space in 'property_type'.
excluded_types = ['Terrains et fermes', 'Local', 'Bureaux', 'Autre Immobilier']
df = df[~df['property_type'].str.strip().isin(excluded_types)]

print(f"Remaining ads after category filter: {len(df)}")

Remaining ads after category filter: 5164


In [10]:
# DataFrame.info() writes its report itself and returns None, so wrapping it in
# print() only appends a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5164 entries, 0 to 6960
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   price          5164 non-null   float64
 1   room           3586 non-null   float64
 2   bathroom       3519 non-null   float64
 3   surface        2824 non-null   float64
 4   link           5164 non-null   object 
 5   property_type  5164 non-null   object 
 6   city           5164 non-null   object 
 7   zone           3697 non-null   object 
dtypes: float64(4), object(4)
memory usage: 363.1+ KB
None


<h2>Data visualisation</h2>

In [11]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

KeyboardInterrupt: 

In [None]:
# Summary statistics of the numeric columns.
numeric_summary = df.describe()
print(numeric_summary)

In [None]:
# Box plot of price for each property type; the y-axis is limited to
# 0 - 8,000,000 so listings above that are cut off from the view.
if 'property_type' in df.columns:
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.boxplot(x='property_type', y='price', data=df, ax=ax)
    plt.setp(ax.get_xticklabels(), rotation=45)
    ax.set_title("Price Distribution by Property Type")
    ax.set_ylabel("Price (MAD)")
    ax.set_ylim(0, 8000000)
    plt.show()
    

In [None]:
from scipy.stats import f_oneway  # ANOVA

# Step 1 - box plot of price per city (y-axis limited to 0 - 12,000,000)
fig, ax = plt.subplots(figsize=(14, 5))
sns.boxplot(data=df, x='city', y='price', ax=ax)
plt.setp(ax.get_xticklabels(), rotation=90)
ax.set_ylim(0, 12000000)
ax.set_title('Price Distribution by City', fontsize=16)
ax.set_ylabel('Price (MAD)')
ax.set_xlabel('City')
plt.show()

# Step 2 - one-way ANOVA on price across cities.
# f_oneway takes one array of observations per group, so build one array of
# non-missing prices for every city.
city_price_groups = [
    city_rows["price"].dropna().values
    for _, city_rows in df.groupby("city")
]

f_stat, p_val = f_oneway(*city_price_groups)

print(f"ANOVA F-statistic: {f_stat:.2f}")
print(f"P-value: {p_val:.5f}")

# Report the outcome at the 5% significance level.
verdict = (
    "‚úÖ There is a statistically significant difference in prices between cities."
    if p_val < 0.05
    else "‚ùå No statistically significant difference in prices between cities."
)
print(verdict)

In [None]:
# Median price of each city, sorted from highest to lowest, shown as a bar chart.
city_price = (
    df.groupby('city')['price']
    .median()
    .sort_values(ascending=False)
)

fig, ax = plt.subplots(figsize=(14, 4))
sns.barplot(x=city_price.index, y=city_price.values, ax=ax)
plt.setp(ax.get_xticklabels(), rotation=90)
ax.set_title('Median Price by City')
ax.set_ylabel('Median Price (MAD)')
plt.show()


In [None]:
# Summary statistics of the surface column, followed by the 20 smallest
# distinct surfaces with their frequencies.
print(df['surface'].describe())
surface_counts = df['surface'].value_counts().sort_index()
print(surface_counts.head(20))


In [None]:
# Bounds (inclusive) of the surface values that are kept.
surface_min = 20
surface_max = 7000

# Row count before filtering, for reference
print(f"Nombre d'annonces avant filtrage: {df.shape[0]}")

# Keep only the rows whose surface lies within [surface_min, surface_max].
# Rows with a missing surface fail the comparison and are dropped as well.
# .copy() makes df_cleaned an independent frame, so adding columns to it later
# neither triggers a chained-assignment warning nor touches df.
df_cleaned = df[df['surface'].between(surface_min, surface_max)].copy()

# Row count after filtering
print(f"Nombre d'annonces apr√®s filtrage: {df_cleaned.shape[0]}")

# Quick check: distribution of the remaining surfaces
print(df_cleaned['surface'].describe())

In [None]:
# Pairwise correlations between the numeric columns of the surface-filtered
# listings, drawn as an annotated heatmap.
numerical = df_cleaned.select_dtypes(include='number')
corr_matrix = numerical.corr()

fig, ax = plt.subplots(figsize=(6, 3))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

In [None]:
# Surface/price correlation computed separately within each property type.
# The result has one 2x2 block (rows: surface, price) per property type.
per_type_corr = df_cleaned.groupby("property_type")[['surface', 'price']].corr()

plt.figure(figsize=(10, 3))
sns.heatmap(per_type_corr, annot=True, cmap='coolwarm', fmt='.2f')
plt.title("Correlation Heatmap")
plt.show()

In [None]:
# Box plot of price for each room count; the y-axis is limited to
# 0 - 17,500,000.
if 'room' in df.columns:
    fig, ax = plt.subplots(figsize=(10, 3))
    sns.boxplot(data=df, x='room', y='price', ax=ax)
    ax.set_title("Price by Number of Rooms")
    ax.set_ylim(0, 17500000)
    ax.set_xlabel("Number of Rooms")
    ax.set_ylabel("Price (MAD)")
    plt.show()

In [None]:
import numpy as np

# Seaborn theme used by the plots drawn from here on
sns.set(style="whitegrid", palette="muted")

# ---------------------------
# 1. City vs property type (stacked count)
# ---------------------------
# Rows: cities, columns: property types, values: number of listings
# (0 where a city has no listing of that type).
city_type_counts = df.groupby(["city", "property_type"]).size().unstack(fill_value=0)
city_type_counts.plot(kind="bar", stacked=True, figsize=(16, 6))
plt.title("Distribution of Property Types by City")
plt.xlabel("City")
plt.ylabel("Number of Listings")
plt.xticks(rotation=90)
plt.legend(title="Property Type")
plt.show()

# ---------------------------
# 2. Surface per city
# ---------------------------
plt.figure(figsize=(16, 6))
sns.boxplot(x="city", y="surface", data=df_cleaned)
plt.yscale("log")  # log scale helps show spread
plt.title("Surface Distribution by City")
plt.xlabel("City")
plt.ylabel("Surface (m¬≤, log scale)")
plt.xticks(rotation=90)
plt.show()

# ---------------------------
# 3. Log(price) vs. surface
# ---------------------------
# df_cleaned is a filtered selection of df; assign() builds a new frame with
# the extra column instead of writing into that selection, which avoids the
# chained-assignment warning. Re-running the cell gives the same result.
df_cleaned = df_cleaned.assign(log_price=np.log(df_cleaned["price"]))

plt.figure(figsize=(8, 6))
sns.regplot(x="surface", y="log_price", data=df_cleaned, scatter_kws={"alpha":0.3})
plt.title("Surface vs. Log(Price)")
plt.xlabel("Surface (m¬≤)")
plt.ylabel("Log(Price)")
plt.show()

<h1>Predictive model</h1>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.impute import KNNImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.impute import KNNImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# =======================
# 1. Copy dataframe
# =======================
dfn = df.copy()

# =======================
# 2. Outlier removal (price)
# =======================
# Keep the listings whose price lies between the 1st and 99th percentile.
lower_bound = dfn["price"].quantile(0.01)
upper_bound = dfn["price"].quantile(0.99)
dfn = dfn[(dfn["price"] >= lower_bound) & (dfn["price"] <= upper_bound)]

# Drop columns that are not used as features
dfn = dfn.drop(['zone', 'link'], axis=1, errors='ignore')

# =======================
# 3. One-hot encode categoricals
# =======================
dfn = pd.get_dummies(dfn, columns=["city", "property_type"], drop_first=True)

# =======================
# 4. Target (log scale) and train/test split
# =======================
# The split is done BEFORE imputation so that the imputer never sees the test
# rows, and the target column is kept out of the feature matrix so that
# imputed feature values cannot encode the price.
feature_cols = [col for col in dfn.columns if col != "price"]
X_raw = dfn[feature_cols].reset_index(drop=True)
price = dfn["price"].reset_index(drop=True)
y = np.log1p(price).rename("LogPrice")

X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_raw, y, test_size=0.2, random_state=42
)

# =======================
# 5. KNN imputation for missing feature values
# =======================
# Fitted on the training features only, then applied to both parts.
imputer = KNNImputer(n_neighbors=5)
X_train = pd.DataFrame(
    imputer.fit_transform(X_train_raw), columns=feature_cols, index=X_train_raw.index
)
X_test = pd.DataFrame(
    imputer.transform(X_test_raw), columns=feature_cols, index=X_test_raw.index
)

# Full imputed tables, kept under their previous names and column layout
# (features + price in the order of dfn, then LogPrice) for the cells that
# export or inspect them.
X = pd.concat([X_train, X_test]).sort_index()
dfn_imputed = pd.concat([X, price], axis=1)[list(dfn.columns)]
dfn_imputed["LogPrice"] = y

# =======================
# 6. Models
# =======================
models = {
    "Random Forest": RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1),
    "XGBoost": XGBRegressor(n_estimators=300, learning_rate=0.1, max_depth=6, random_state=42, n_jobs=-1),
    "Linear Regression": LinearRegression()
}

# =======================
# 7. Train, Predict & Evaluate
# =======================
# Metrics are reported on the original price scale, so the log1p transform is
# inverted with expm1. The true test prices do not depend on the model and are
# computed once.
y_test_original = np.expm1(y_test)

results = {}
for name, model in models.items():
    model.fit(X_train, y_train)

    # Predict in log space, then invert the transform
    y_pred_log = model.predict(X_test)
    y_pred = np.expm1(y_pred_log)

    # safety: prices can't be negative
    y_pred = np.clip(y_pred, 0, None)

    mae = mean_absolute_error(y_test_original, y_pred)
    mse = mean_squared_error(y_test_original, y_pred)
    rmse = np.sqrt(mse)  # computed by hand rather than via a metric argument
    r2 = r2_score(y_test_original, y_pred)

    results[name] = {"MAE": mae, "RMSE": rmse, "R¬≤": r2}

results_df = pd.DataFrame(results).T
print("\nüìä Model Performance Comparison:")
print(results_df)

# =======================
# 8. Feature Importance (Tree Models only)
# =======================
import matplotlib.pyplot as plt

for name in ["Random Forest", "XGBoost"]:
    model = models[name]
    importance = model.feature_importances_
    # Show at most 10 features, fewer if the model has fewer than 10.
    top_n = min(10, len(importance))
    indices = np.argsort(importance)[::-1][:top_n]

    plt.figure(figsize=(8, 5))
    plt.barh(range(top_n), importance[indices], align='center', color='skyblue')
    plt.yticks(range(top_n), [X.columns[i] for i in indices])
    plt.gca().invert_yaxis()  # most important feature at the top
    plt.title(f"Top {top_n} Features - {name}")
    plt.show()


In [None]:
import joblib

# Model that gets exported (use the "Random Forest" key to export that one instead)
best_model = models["XGBoost"]

# Write the fitted model to disk
joblib.dump(best_model, "best_model.pkl")

# Write both tables as well (for dashboard stats & encoding reference):
# the cleaned listings and the imputed, one-hot encoded matrix.
for frame, csv_name in ((df, "cleaned_data.csv"), (dfn_imputed, "encoded_data.csv")):
    frame.to_csv(csv_name, index=False)

print("‚úÖ Model and dataset saved successfully!")

In [None]:
# Column names of the encoded table, printed as a plain Python list.
encoded_columns = list(dfn_imputed.columns)
print(encoded_columns)