In [30]:
import pandas as pd
import re
from scipy.stats import zscore
import matplotlib.pyplot as plt

import seaborn as sns



ModuleNotFoundError: No module named 'matplotlib'

In [23]:
# Load the new dataset
new_file_path = '/home/corolo/Desktop/analysisAvito/data/avito_data.csv'
new_data = pd.read_csv(new_file_path)

# Display initial information about the dataset's structure
new_data_info = new_data.dtypes.to_frame(name="Data Type")
new_data_info["Non-Null Count"] = new_data.notnull().sum()
new_data_info["Missing Values (%)"] = (new_data.isnull().sum() / len(new_data)) * 100

# Display the first few rows to understand column contents
new_data_info, new_data.head()


(             Data Type  Non-Null Count  Missing Values (%)
 Unnamed: 0       int64            1760            0.000000
 title           object            1400           20.454545
 price           object            1400           20.454545
 city            object            1400           20.454545
 datetime        object            1394           20.795455
 nb_rooms       float64            1193           32.215909
 nb_baths        object            1181           32.897727
 surface_area    object             990           43.750000
 equipement      object             868           50.681818
 link            object            1760            0.000000,
    Unnamed: 0                                              title  \
 0           0   CMN-MA-1641 - Appartement à vendre à Val Fleurie   
 1           1           Appartement à vendre 125 m² à Casablanca   
 2           2  CMN-MA-1787 - Appartement à vendre à Les Hôpitaux   
 3           3       Appartement à vendre 3 chambres Salon à Ca

In [24]:
# Function to clean and convert the 'price' column to numerical format
def clean_price(price):
    if isinstance(price, str):
        price = re.sub(r'[^\d,]', '', price)  # Remove non-numeric characters except commas
        price = price.replace(',', '')        # Remove commas for integer conversion
        try:
            return float(price)
        except ValueError:
            return None  # If conversion fails, return None
    return price

# Apply the cleaning function to the 'price' column
new_data['price_cleaned'] = new_data['price'].apply(clean_price)

# Display summary statistics of the cleaned 'price' column
price_summary = new_data['price_cleaned'].describe()
price_summary


count    1.193000e+03
mean     2.014280e+06
std      1.872560e+07
min      3.500000e+01
25%      5.600000e+05
50%      1.050000e+06
75%      1.720000e+06
max      6.279000e+08
Name: price_cleaned, dtype: float64

In [25]:
# Function to impute 'type_de_vente' based on keywords in 'title'
def infer_type_de_vente(title, current_type):
    if pd.isnull(current_type):
        # Check if any of the rental keywords appear in the title
        if any(keyword in str(title).lower() for keyword in ["louer", "loué", "location"]):
            return "louer"
    return current_type

# Apply the function to infer missing 'type_de_vente' values
new_data['type_de_vente'] = new_data.apply(lambda row: infer_type_de_vente(row['title'], row.get('type_de_vente', None)), axis=1)

# Fill any remaining NaN values in 'type_de_vente' with 'louer'
new_data['type_de_vente'].fillna('louer', inplace=True)

# Check if any NaN values remain in 'type_de_vente'
type_de_vente_missing_final = new_data['type_de_vente'].isnull().sum()
type_de_vente_missing_final


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  new_data['type_de_vente'].fillna('louer', inplace=True)


np.int64(0)

In [26]:
# Convert 'nb_baths' to numeric, handling non-numeric entries
new_data['nb_baths'] = pd.to_numeric(new_data['nb_baths'], errors='coerce')

# Impute missing values
# For 'nb_rooms' and 'nb_baths', using median imputation
new_data['nb_rooms'].fillna(new_data['nb_rooms'].median(), inplace=True)
new_data['nb_baths'].fillna(new_data['nb_baths'].median(), inplace=True)

# For 'surface_area', using median imputation due to high missing percentage
new_data['surface_area'] = pd.to_numeric(new_data['surface_area'], errors='coerce')
new_data['surface_area'].fillna(new_data['surface_area'].median(), inplace=True)

# Verify if any NaN values remain in the imputed columns
remaining_missing_values = new_data[['nb_rooms', 'nb_baths', 'surface_area']].isnull().sum()
remaining_missing_values


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  new_data['nb_rooms'].fillna(new_data['nb_rooms'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  new_data['nb_baths'].fillna(new_data['nb_baths'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because th

nb_rooms        0
nb_baths        0
surface_area    0
dtype: int64

In [27]:


# Calculate Z-scores for outlier detection in the 'price_cleaned', 'nb_rooms', 'nb_baths', and 'surface_area' columns
outlier_columns = ['price_cleaned', 'nb_rooms', 'nb_baths', 'surface_area']
for col in outlier_columns:
    # Apply Z-score method to identify outliers (Z > 3)
    new_data[f'{col}_zscore'] = zscore(new_data[col].fillna(new_data[col].mean()))
    # Cap outliers beyond the 99th percentile
    cap = new_data[col].quantile(0.99)
    new_data[col] = new_data[col].apply(lambda x: min(x, cap) if pd.notnull(x) else x)

# Verify by displaying capped values summary
capped_values_summary = new_data[outlier_columns].describe()
capped_values_summary


Unnamed: 0,price_cleaned,nb_rooms,nb_baths,surface_area
count,1193.0,1760.0,1760.0,1760.0
mean,1284602.0,2.259091,1.746591,101.844398
std,995593.3,0.606884,0.541081,39.868676
min,35.0,1.0,0.0,11.0
25%,560000.0,2.0,1.0,88.0
50%,1050000.0,2.0,2.0,96.0
75%,1720000.0,3.0,2.0,104.0
max,5512800.0,4.0,3.0,288.23


In [29]:
# Standardize numerical columns for consistency
from sklearn.preprocessing import StandardScaler

# Select columns to standardize
scaler = StandardScaler()
new_data[['price_cleaned', 'nb_rooms', 'nb_baths', 'surface_area']] = scaler.fit_transform(new_data[['price_cleaned', 'nb_rooms', 'nb_baths', 'surface_area']])

# Display the summary of standardized columns
standardized_summary = new_data[['price_cleaned', 'nb_rooms', 'nb_baths', 'surface_area']].describe()
standardized_summary


Unnamed: 0,price_cleaned,nb_rooms,nb_baths,surface_area
count,1193.0,1760.0,1760.0,1760.0
mean,-2.322814e-16,-2.422305e-16,1.2111520000000002e-17,-1.150595e-16
std,1.000419,1.000284,1.000284,1.000284
min,-1.290793,-2.07527,-3.228885,-2.279238
25%,-0.728114,-0.4270412,-1.380206,-0.3473487
50%,-0.2357388,-0.4270412,0.4684719,-0.1466329
75%,0.437509,1.221188,0.4684719,0.05408293
max,4.248694,2.869417,2.31715,4.676317
