**1 - Fonctions autre projet**

In [None]:
# Fonctions d'entraînement du modèle

from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

def train_model(training_data, params, features):
    """Fit the categorical encoders and a random-forest regressor.

    Parameters
    ----------
    training_data : pandas.DataFrame
        Training rows. Must contain the target column ``"valeur_fonciere"``
        and every column listed in ``features``.
    params : dict, int or None
        Hyper-parameters of ``RandomForestRegressor``. A dict is forwarded as
        keyword arguments. A bare number is interpreted as ``max_depth``
        (this is how the notebook calls the function). ``None`` keeps the
        estimator defaults.
    features : list
        Names of the columns used as model inputs.

    Returns
    -------
    tuple
        ``(model, params)`` where ``model`` is a dict with the keys
        ``"encoder"``, ``"regressor"`` and ``"feature_name"``, and ``params``
        is returned unchanged.
    """
    df_features, target = split_target_features(training_data, "valeur_fonciere")
    # Work on an explicit copy: the encoding step adds columns in place and
    # would otherwise write onto a slice of the caller's frame.
    df_features = df_features.copy()

    encoder_model = fit_encode_features(df_features, features)
    x_train, encoded_features_name = transform_encode_features(
        encoder_model, df_features, features
    )

    # Passing `params` positionally would silently set `n_estimators`
    # (the first positional parameter of RandomForestRegressor).
    if params is None:
        regressor_kwargs = {}
    elif isinstance(params, dict):
        regressor_kwargs = dict(params)
    else:
        regressor_kwargs = {"max_depth": params}

    regression_model = RandomForestRegressor(**regressor_kwargs)
    regression_model.fit(x_train[encoded_features_name].values, target.values)

    model = {
        "encoder": encoder_model,
        "regressor": regression_model,
        "feature_name": features,
    }
    return model, params


def split_target_features(data: pd.DataFrame, target_name: str):
    """Separate the target column from the remaining columns.

    Returns a pair ``(features_frame, target_series)``: every column of
    ``data`` except ``target_name``, and the ``target_name`` column itself.
    """
    feature_columns = []
    for column in data.columns:
        if column != target_name:
            feature_columns.append(column)

    assert target_name not in feature_columns
    return data[feature_columns], data[target_name]


def fit_encode_features(df_features: pd.DataFrame, features_name: list):
    """Fit one ``LabelEncoder`` per object-typed column listed in ``features_name``.

    Each encoder is fitted on the column values plus the extra label
    ``"unknown"``, so that label is always part of the fitted classes.

    Returns a dict mapping column name -> fitted encoder.
    """
    object_columns = df_features.select_dtypes("object").columns
    fitted_encoders = {}

    for column in object_columns:
        if column in features_name:
            labels = list(df_features[column])
            labels.append("unknown")
            fitted_encoders[column] = LabelEncoder().fit(labels)

    return fitted_encoders


def transform_encode_features(encoders, df_features: pd.DataFrame, features: list, suffix: str = "_cat"):
    """Encode the categorical feature columns with already-fitted encoders.

    For every column that has an encoder and is listed in ``features``, a new
    column ``<col><suffix>`` with ``category`` dtype is added to
    ``df_features``. Values that are not among the encoder's fitted classes
    (including missing values) are encoded as the label ``"unknown"``.

    Parameters
    ----------
    encoders : dict
        Column name -> fitted encoder exposing ``classes_`` and ``transform``.
    df_features : pandas.DataFrame
        Frame to encode. It is modified in place and also returned.
    features : list
        Feature names; left untouched (a copy is edited).
    suffix : str
        Suffix appended to the name of each encoded column.

    Returns
    -------
    tuple
        ``(df_features, encoded_features_name)`` where the second item is
        ``features`` with every encoded column replaced by its suffixed name
        (moved to the end of the list).
    """
    encoded_features_name = features.copy()
    for col, encoder in encoders.items():
        if col not in features:
            continue
        col_cat = col + suffix
        # One pass with a set lookup instead of rebuilding the whole column
        # once per unseen label.
        known_labels = set(encoder.classes_)
        new_df_col = [x if x in known_labels else "unknown" for x in df_features[col]]
        df_features[col_cat] = encoder.transform(new_df_col)
        df_features[col_cat] = df_features[col_cat].astype("category")
        encoded_features_name.remove(col)
        encoded_features_name.append(col_cat)

    return df_features, encoded_features_name


def predict_price(df_features, model, features):
    """Return the regressor's predictions for the rows of ``df_features``.

    The categorical columns are encoded with ``model["encoder"]`` on a copy of
    the input, so the caller's frame is left untouched, and the resulting
    feature matrix is passed to ``model["regressor"]``.
    """
    working_frame = df_features.copy()
    encoded_frame, model_columns = transform_encode_features(
        model["encoder"], working_frame, features
    )
    design_matrix = encoded_frame[model_columns].values
    return model["regressor"].predict(design_matrix)


def mae(y_true: pd.Series, y_pred: pd.Series, in_percent: bool = False) -> float:
    """Mean absolute error between ``y_pred`` and ``y_true``.

    The sum of absolute differences is divided by ``len(y_true)``. With
    ``in_percent=True`` the result is multiplied by 100 (it is not normalised
    by the actual values). Both inputs must be non-empty.
    """
    assert len(y_true) > 0, "MAE need at least one actual"
    assert len(y_pred) > 0, "MAE needs at least one prediction"

    total_abs_error = np.sum(np.abs(y_pred - y_true))
    factor = 100 if in_percent else 1
    return factor * total_abs_error / len(y_true)


def evaluate(y, y_hat, method):
    """Score predictions ``y_hat`` against actuals ``y`` with the named metric.

    ``method`` is one of ``"mae"``, ``"wmape"``, ``"smape"`` or ``"r2"``; any
    other value raises ``ValueError``. The metric function is only looked up
    when its branch is selected.
    """
    if method == "mae":
        return mae(y, y_hat)
    if method == "wmape":
        return wmape(y, y_hat)
    if method == "smape":
        return smape(y, y_hat)
    if method == "r2":
        return r2(y, y_hat)
    raise ValueError(f"Unknown method for backtest: {method}")


def backtest_model(test_data, model, features):
    """Return the MAE of ``model`` on ``test_data``.

    The target column ``"valeur_fonciere"`` is split off, prices are predicted
    from the remaining columns, and the two are compared with the ``"mae"``
    metric.
    """
    inputs, actual = split_target_features(test_data, "valeur_fonciere")
    predictions = predict_price(inputs, model, features)
    return evaluate(actual, predictions, "mae")

In [None]:
# Train - test split

from sklearn.model_selection import train_test_split

# prix_m2 is removed before modelling.
data = data.drop(columns="prix_m2")

# Hold out 20% of the rows for evaluation (no random_state: the split differs between runs).
training_data, test_data = train_test_split(data, test_size=0.2)
training_data.shape

In [None]:
# Columns used as model inputs.
# The target 'valeur_fonciere' must not be listed (train_model / backtest_model
# split it off before selecting the features) and 'prix_m2' was dropped from
# `data` before the train/test split; listing either raises a KeyError.
features_name = ['type_local', 'surface_reelle_bati', 'nombre_pieces_principales',
                 'nature_culture', 'surface_terrain', 'longitude', 'latitude', 'code_departement']
max_depth = 7

model_ensae, params_ensae = train_model(training_data, max_depth, features_name)

score = backtest_model(test_data, model_ensae, features_name)

# backtest_model evaluates with the "mae" metric, so the label says MAE.
print(f"Le score MAE avec les features sélectionnés est : {score}")

In [None]:
# Disabled experiment kept as a bare string literal: none of the code below is executed.
"""
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

y = data["valeur_fonciere"].values

X = data.drop(["valeur_fonciere", "prix_m2"], axis = 1).values

model = RandomForestRegressor(n_estimators = 30, min_samples_split = 5)

model.fit(X,y)
"""

**2 - Eloi Preprocessing**

In [None]:
# Les fichiers sont renommés valeursfoncieres-(année)

def preprocessing(year):
    """Load one yearly land-value file and keep the rows usable for modelling.

    Reads ``data/valeursfoncieres-<year>.csv``, keeps 15 columns selected by
    position, and filters the rows to:

    * ``nature_mutation`` equal to ``'Vente'`` or
      ``"Vente en l'état futur d'achèvement"``;
    * ``nombre_lots`` equal to 0 or 1;
    * ``surface_reelle_bati`` present and non-zero;
    * ``valeur_fonciere`` present.

    Missing address parts are filled (0 for the numeric parts, a single space
    for the text parts) and a full-text ``Adresse`` column is built. In
    ``Adresse`` the postal code is left-padded with zeros to 5 characters, so
    codes whose leading zero is lost by the integer conversion (e.g. 01000)
    stay 5 characters wide; a missing code appears as ``00000``. The
    ``code_postal`` column itself remains an integer.

    Parameters
    ----------
    year : int or str
        Year used to build the file name.

    Returns
    -------
    pandas.DataFrame
        The filtered table with a fresh 0..n-1 index.
    """
    table = pd.read_csv('data/valeursfoncieres-' + str(year) + '.csv', sep=',')

    # 1-based positions of the columns of interest in the raw file.
    wanted_positions = [2, 4, 5, 6, 7, 8, 9, 10, 12, 29, 31, 32, 38, 39, 40]
    table = table[table.columns[[k - 1 for k in wanted_positions]]]

    sale_kinds = ['Vente', "Vente en l'état futur d'achèvement"]
    keep = (
        table['nature_mutation'].isin(sale_kinds)
        & table['nombre_lots'].isin([0, 1])
        & table['surface_reelle_bati'].notna()
        & (table['surface_reelle_bati'] != 0)
        & table['valeur_fonciere'].notna()
    )
    # Explicit copy so the column assignments below do not write onto a slice.
    table = table[keep].copy()

    table['adresse_numero'] = table['adresse_numero'].fillna(0).astype(int)
    table['code_postal'] = table['code_postal'].fillna(0).astype(int)
    for text_column in ('adresse_suffixe', 'adresse_code_voie', 'adresse_nom_voie', 'nom_commune'):
        table[text_column] = table[text_column].fillna(' ')

    table['Adresse'] = (
        table['adresse_numero'].astype(str)
        + ' ' + table['adresse_suffixe']
        + ' ' + table['adresse_code_voie']
        + ' ' + table['adresse_nom_voie']
        + ' ' + table['nom_commune']
        + ' ' + table['code_postal'].astype(str).str.zfill(5)
        + ' ' + 'France'
    )

    table.reset_index(drop=True, inplace=True)

    return table

In [None]:
# Run the preprocessing on the 2017 file only.
test = preprocessing(2017)

In [None]:
# Show the number of rows kept, then the first rows.
display(len(test))
test.head()

In [None]:
# drop_duplicates returns a new frame; the result must be assigned back,
# otherwise the call has no effect on `test`.
test = test.drop_duplicates(subset=['date_mutation', 'valeur_fonciere', 'adresse_nom_voie'], keep='last')
test.head()

In [None]:
# Preprocess each yearly CSV and store the result in 'dicte',
# keyed by the offset from 2017 (0 for 2017 ... 3 for 2020).
years = [2017, 2018, 2019, 2020]
dicte = {k - 2017: preprocessing(k) for k in years}

In [None]:
# Stack every preprocessed year in key order with a single concat: no
# hard-coded number of years, no empty seed frame, and no re-copying of the
# accumulated table on each iteration. Row labels are kept as they are.
output = pd.concat([dicte[k] for k in sorted(dicte)], axis=0)

In [None]:
# Keep only the columns needed downstream, then show ten random rows.
kept_columns = [
    'date_mutation', 'nature_mutation', 'valeur_fonciere', 'type_local',
    'surface_reelle_bati', 'surface_terrain', 'Adresse', 'latitude', 'longitude',
]
output = output.loc[:, kept_columns]
output.sample(n=10)

In [None]:
# Export the final table as `div` smaller CSV files: final-1.csv ... final-<div>.csv.
# NOTE(review): files are written to the working directory, while the loading
# cell reads 'data/final-<i>.csv' — confirm where the files are expected.
div = 10
# Ceiling division: with round(len(output) / div) the trailing rows were lost
# whenever the quotient rounded down.
chunk_size = -(-len(output) // div)
for k in range(div):
    name = 'final-' + str(1 + k) + '.csv'
    df = output.iloc[k * chunk_size:(k + 1) * chunk_size]
    df.to_csv(name, index=False, header=True)

**2.1 Création du modèle pour obtenir le prix moyen du voisinage (BallTree)**

**2.1.1 Importation des données**

data_vf : base de données de l'INSEE sur l'historique des valeurs foncières

data_communes : base de données de l'INSEE sur les communes françaises

In [None]:
# Load data_vf: the `div` chunk files are read in order and stacked
# (each chunk keeps its own row labels).
div = 10
data_vf = pd.concat(
    [pd.read_csv('data/final-' + str(1 + k) + '.csv', sep=',') for k in range(div)]
)

# Load data_communes
data_communes = pd.read_csv('communes_insee.csv', sep=';')

# Load data_regions
data_regions = pd.read_csv('communes-departement-region.csv', sep=',')

**2.1.2 Visualisation du dataframe**

In [None]:
# Quick look at data_vf: dimensions, then three random rows.
vf_rows, vf_cols = data_vf.shape
print("Nombre de lignes de data data_vf :", vf_rows, sep="\n")
print("Nombre de colonnes de data data_vf :", vf_cols, sep="\n")
print("Visualisation de data_vf :")
data_vf.sample(n=3)

In [None]:
# Quick look at data_communes: dimensions, then the distinct values of "REG".
communes_rows, communes_cols = data_communes.shape
print("Nombre de lignes de data data_communes :", communes_rows, sep="\n")
print("Nombre de colonnes de data data_communes :", communes_cols, sep="\n")
print("Visualisation de data_communes :")
data_communes["REG"].unique()

In [None]:
# Quick look at data_regions: dimensions, three random rows,
# then the number of distinct region codes (missing values counted as one).
regions_rows, regions_cols = data_regions.shape
print("Nombre de lignes de data data_regions :", regions_rows, sep="\n")
print("Nombre de colonnes de data data_regions :", regions_cols, sep="\n")
print("Visualisation de data_regions :")
display(data_regions.sample(n=3))
data_regions["code_region"].nunique(dropna=False)

**2.1.3 Création d'une BD inédite**

In [None]:
# Download the dataset as CSV and show three random rows.
co2_dataset_url = "https://koumoul.com/s/data-fair/api/v1/datasets/igt-pouvoir-de-rechauffement-global/convert"
df_co2 = pd.read_csv(co2_dataset_url)
df_co2.sample(n=3)

In [None]:
# Rows of data_vf whose index label has already appeared earlier in the frame.
repeated_label = data_vf.index.duplicated()
data_vf.loc[repeated_label]

In [None]:
# Working copy with a clean 0..n-1 index. data_vf carries repeated row labels
# (one range per concatenated chunk), so every derived column below is
# computed from data_new itself: assigning a Series taken from data_vf cannot
# be aligned on duplicated labels.
data_new = data_vf.copy()
data_new.reset_index(drop=True, inplace=True)

# prix_m2: price per square metre = price / area (the operands were inverted).
# NOTE(review): the area used is 'surface_terrain', as in the original cell;
# rows where it is 0 or missing give inf / NaN — confirm whether
# 'surface_reelle_bati' was intended.
data_new['prix_m2'] = data_new['valeur_fonciere'] / data_new['surface_terrain']

# cp: postal code = second-to-last token of "Adresse" ("... <code> France"),
# left-padded to 5 characters so codes that lost a leading zero are still valid.
data_new['cp'] = data_new['Adresse'].str.rsplit(' ', n=2).str[-2].str.zfill(5)

# dep: first two characters of the postal code
data_new['dep'] = data_new['cp'].str[:2]

# region

# region

In [None]:
# Display data_new.
data_new

In [None]:
# code_region
# Postal code of the row labelled 2, kept in x for inspection.
x = data_new.loc[2, "cp"]
x

In [None]:
# FIXME(review): unfinished draft — `data_new[]` has no key, so this line is a
# SyntaxError; the intended lookup cannot be inferred from this cell.
data_regions["nom_region"].iloc[data_new[]]

In [None]:
# Build one BallTree per region from the (latitude_r, longitude_r) columns and
# save each tree under the name 'appart_<region>'.
# The typographic quotes of the original cell were a SyntaxError; they are
# replaced by plain quotes. The dead assignment `regions = 13` was overwritten
# before any use and is dropped.
# NOTE(review): the haversine metric expects coordinates in radians —
# presumably what the `_r` suffix stands for; confirm.
appart_old['Distance moyenne'] = np.zeros(len(appart_old))
appart_old['Indices voisins'] = np.zeros(len(appart_old))
models = {}
regions = appart_old.Région.unique()
for k, region in enumerate(regions):
    name = 'appart_' + region
    data = appart_old[appart_old.Région == region]
    data = data.reset_index(drop=True)
    models[k] = BallTree(data[['latitude_r', 'longitude_r']].values, leaf_size=2, metric='haversine')
    save_obj(models[k], name)