# Cleaning the DV3F Dataset

In [1]:
# Import packages
import pandas as pd
import numpy as np
import os

import geopandas as gpd
from shapely import geometry

In [2]:
# Work from the project folder so that the relative "mod/..." paths used below resolve.
# Set the DV3F_DIR environment variable to run the notebook from another location;
# when it is unset, the original folder is used.
os.chdir(os.environ.get("DV3F_DIR", "/Users/unaioyon/Desktop/DV3F_new"))

## 0. Cleaning the DV3F geometries

In [None]:
# Read the CSV whose "st_astext" column stores the geometries as WKT text.
dv3f = pd.read_csv(filepath_or_buffer="mod/mutations_paris_geometrie.csv")

In [None]:
# Preview the first four rows of the freshly loaded table.
dv3f.head(4)

In [None]:
# Convert the WKT-stored geometries to Shapely (GeoPandas-tractable) geometries.
# Missing WKT values are coded as NaN, which causes a problem when parsing, so they
# are turned into None first. Only the "st_astext" values are normalised: applying
# `.where(..., None)` to the whole frame rewrites every column, and its effect on the
# other columns' dtypes depends on the pandas version.
wkt_values = [None if pd.isna(value) else value for value in dv3f["st_astext"]]
# Pass the frame's index explicitly so the parsed geometries align row by row.
dv3f["geometry"] = gpd.GeoSeries.from_wkt(wkt_values, index=dv3f.index)

In [None]:
# Inspect column names, non-null counts and dtypes after adding the "geometry" column.
dv3f.info()

In [None]:
# Promote the table to a GeoDataFrame that uses "geometry" as its active geometry
# column, and discard the raw WKT text column the geometries were parsed from.
# NOTE(review): no CRS is assigned at this point -- confirm the coordinate system of
# the source data before relying on the exported file's projection.
dv3f = gpd.GeoDataFrame(dv3f, geometry="geometry").drop(columns=["st_astext"])

In [None]:
# Inspect column names, non-null counts and dtypes after the conversion to a GeoDataFrame.
dv3f.info()

In [None]:
# Persist the GeoDataFrame to disk as a shapefile.
dv3f.to_file(filename="mod/mutations_paris_geometrie.shp")

In [None]:
# Index labels of the rows whose geometry is missing.
dv3f.loc[dv3f["geometry"].isna()].index

In [None]:
# Quick visual check: draw the geometries with the GeoDataFrame's default plot.
dv3f.plot()

## 1. Loading the final DV3F dataset

In [3]:
# Load the final DV3F dataset from its shapefile (this rebinds `dv3f`).
# NOTE(review): "mod/mutations_paris.shp" is not the file exported in section 0
# ("mod/mutations_paris_geometrie.shp") -- confirm where this file is produced.
dv3f = gpd.read_file("mod/mutations_paris.shp")

In [7]:
# Distinct values taken by the "libtypbien" column.
dv3f["libtypbien"].unique()

array(['DEUX APPARTEMENTS ANCIENS', 'UN APPARTEMENT ANCIEN T2',
       'UN APPARTEMENT ANCIEN T4', 'UNE DEPENDANCE AUTRE', 'UN GARAGE',
       'BATI - INDETERMINE : Vefa sans descriptif',
       'UN APPARTEMENT ANCIEN T1', 'UN APPARTEMENT ANCIEN T3',
       'UN APPARTEMENT AGE INDETERMINE', 'ACTIVITE TERTIAIRE',
       'UN APPARTEMENT ANCIEN T5 ou +', 'DES DEPENDANCES',
       'UNE MAISON ANCIENNE', 'UN APPARTEMENT RECENT T1',
       'BATI MIXTE - LOGEMENT/ACTIVITE',
       'DES APPARTEMENTS DANS LE MEME IMMEUBLE',
       'BATI - INDETERMINE : Vente avec volume(s)',
       'APPARTEMENT INDETERMINE', 'UN APPARTEMENT VEFA OU NEUF T3',
       'UN APPARTEMENT VEFA OU NEUF T2', 'TERRAIN ARTIFICIALISE MIXTE',
       'DEUX APPARTEMENTS INDETERMINES', 'UN APPARTEMENT RECENT T2',
       'UN APPARTEMENT VEFA OU NEUF T1', 'TERRAIN DE TYPE TAB',
       'UN APPARTEMENT VEFA OU NEUF T4',
       'DEUX APPARTEMENTS VEFA OU NEUFS', 'UN APPARTEMENT RECENT T3',
       'UN APPARTEMENT RECENT T4', 'BATI MI

In [5]:
# Summarise the loaded dataset: column names, non-null counts and dtypes.
dv3f.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 427694 entries, 0 to 427693
Data columns (total 50 columns):
 #   Column      Non-Null Count   Dtype   
---  ------      --------------   -----   
 0   idmutation  427694 non-null  float64 
 1   idmutinvar  427694 non-null  object  
 2   idopendata  427694 non-null  object  
 3   idnatmut    427694 non-null  float64 
 4   codservch   324989 non-null  object  
 5   refdoc      324989 non-null  object  
 6   datemut     427694 non-null  object  
 7   anneemut    427694 non-null  float64 
 8   moismut     427694 non-null  float64 
 9   coddep      427694 non-null  float64 
 10  libnatmut   427694 non-null  object  
 11  vefa        427694 non-null  int64   
 12  valeurfonc  427544 non-null  float64 
 13  nblot       427694 non-null  float64 
 14  nbcomm      427694 non-null  float64 
 15  l_codinsee  427694 non-null  object  
 16  nbsection   427694 non-null  float64 
 17  l_section   427694 non-null  object  
 18  nbpar       4276