# Un notebook pour filtrer les données

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.cluster import KMeans, DBSCAN
from sklearn.compose import ColumnTransformer
from sklearn.metrics import silhouette_score
import plotly.express as px
import plotly.graph_objects as go
import plotly.subplots as sp
import random
from collections import defaultdict

### Garder les stations qui sont dans les zones éligibles

In [2]:
# Load the zones table (one row per point, with Latitude/Longitude columns)
# from the project's S3 bucket.
df = pd.read_csv("s3://jedha-final-project-jrat/zones.csv")

In [3]:
# Preview the first rows to check the columns that were loaded.
df.head()

Unnamed: 0,Y,X,Latitude,Longitude,Color
0,79,571,50.198832,3.355932,"rgb(206,136,255)"
1,80,571,50.188113,3.355932,"rgb(206,136,255)"
2,80,579,50.188113,3.480975,"rgb(208,142,255)"
3,81,572,50.177394,3.371562,"rgb(206,136,255)"
4,83,577,50.155957,3.449714,"rgb(208,142,255)"


In [4]:
def is_in_zone(coords: tuple[float, float],
               zone: list[tuple[float, float]]) -> bool:
    """Tell whether a point lies inside a polygon, using ray casting.

    The polygon is the closed shape obtained by joining the entries of
    ``zone`` in order and linking the last entry back to the first. A ray is
    cast from the point towards increasing first coordinate; every polygon
    edge it crosses flips the inside/outside state.

    Args:
        coords (tuple[float, float]): position of the weather station or
            wind farm to test.
        zone (list[tuple[float, float]]): vertices of the authorised zone,
            using the same coordinate order as ``coords``.

    Returns:
        bool: True when the point is inside the polygon. False otherwise,
        including when ``zone`` has fewer than three vertices (such a
        "polygon" has no interior).

    Example:
        coords = (43.02, 1.05)
        zone = [(50.1, -3), (48.3, 5), (32.5, -1), (25.2, 3), (40.0, 2.5)]
        result = is_in_zone(coords, zone)
        print(result)
    """
    n = len(zone)
    if n < 3:
        # Empty or degenerate outline: nothing can be inside it.
        return False

    x, y = coords
    inside = False

    for i in range(n):
        p1x, p1y = zone[i]
        p2x, p2y = zone[(i + 1) % n]  # wraps around to close the polygon

        # Half-open test on the second coordinate: each vertex is counted for
        # exactly one of its two edges. It also implies p1y != p2y, so the
        # division below is safe.
        if min(p1y, p2y) < y <= max(p1y, p2y) and x <= max(p1x, p2x):
            xinters = (y - p1y) * (p2x - p1x) / (p2y - p1y) + p1x
            if p1x == p2x or x <= xinters:
                inside = not inside

    return inside

In [5]:
# Keep only the two geographic columns of the zones table and display them.
global_zone = df.loc[:, ["Latitude", "Longitude"]]
global_zone

Unnamed: 0,Latitude,Longitude
0,50.198832,3.355932
1,50.188113,3.355932
2,50.188113,3.480975
3,50.177394,3.371562
4,50.155957,3.449714
...,...,...
7799,42.567189,9.092288
7800,42.567189,9.295484
7801,42.556470,9.107919
7802,42.492159,9.076658


In [6]:
# Draw 500 random (latitude, longitude) test points from a dedicated, seeded
# generator so that a fresh "Restart & Run All" always yields the same sample.
rng = random.Random(0)
coordinates = [(rng.uniform(40, 50), rng.uniform(-10, 10)) for _ in range(500)]

# Build the list of (Latitude, Longitude) tuples straight from df instead of
# from the previous value of global_zone: re-running this cell would otherwise
# fail once global_zone has already been turned into a list.
global_zone = list(
    df[["Latitude", "Longitude"]].itertuples(index=False, name=None)
)

In [7]:
# Base layer: every zone point (column 0 = latitude, column 1 = longitude).
fig = px.scatter_mapbox(global_zone, lat=0, lon=1,
                        zoom=5,
                        height=800)

# Classify every test point against the whole set of zone points at once:
# green when is_in_zone reports it inside, red otherwise.
colors = ["green" if is_in_zone(place, global_zone) else "red"
          for place in coordinates]

# One trace carrying a per-point colour array, instead of one trace per point
# (500 separate traces make the figure slow to build and to render).
fig.add_scattermapbox(lat=[place[0] for place in coordinates],
                      lon=[place[1] for place in coordinates],
                      mode="markers",
                      marker=dict(size=10, color=colors))

fig.update_layout(mapbox_style="open-street-map")
fig.show()

La méthode de ray casting n'est pas très efficace quand on donne toutes les zones en une fois, ce qui est normal.
Nous allons donc essayer la méthode k-means pour regrouper les zones en sous-zones, voire englober directement les stations météo par la suite.

In [8]:
k_max = 20
wcss, silhouette = [], []

# Fit one k-means model per candidate cluster count (2 .. k_max - 1) and
# record its inertia and silhouette score for the plots further down.
for k in range(2, k_max):
    km = KMeans(init="k-means++", n_clusters=k, random_state=0).fit(global_zone)
    wcss.append(km.inertia_)
    silhouette.append(silhouette_score(global_zone, km.labels_))

In [9]:
def plot_this(list_: list, k_max: int, type: str="silhouette") -> None:
    """Plot one score per candidate number of clusters.

    The x axis is the cluster count, running from 2 to ``k_max - 1``; the y
    axis is the last column of the frame built from ``list_``.

    Args:
        list_ (list): scores, one per cluster count, in increasing order of k.
        k_max (int): exclusive upper bound of the cluster counts that were
            evaluated.
        type (str): "silhouette" draws a bar chart, "inertia" a line chart.
            The value is also used as the y-axis label and in the title.

    Raises:
        ValueError: if ``type`` is neither "silhouette" nor "inertia".
    """
    # Validate first so that a typo fails fast, with a message saying why.
    if type not in ("silhouette", "inertia"):
        raise ValueError(
            f"Unknown plot type {type!r}; expected 'silhouette' or 'inertia'"
        )

    frame = pd.DataFrame(list_)
    k_frame = pd.Series(range(2, k_max))

    # Both chart kinds take the same arguments; only the plotting function differs.
    draw = px.bar if type == "silhouette" else px.line
    fig = draw(frame,
               x=k_frame,
               y=frame.iloc[:, -1])

    fig.update_layout(yaxis_title=type,
                      xaxis_title="clusters",
                      title=f"{type} by cluster")

    fig.show()

In [10]:
# Inertia as a line chart, then silhouette scores as a bar chart, both
# against the number of clusters tried in the sweep.
plot_this(wcss, k_max, type="inertia")
plot_this(silhouette, k_max, type="silhouette")

In [11]:
# Split the zone points into 50 clusters and store each point's label on df.
km = KMeans(init="k-means++", n_clusters=50, random_state=0).fit(global_zone)

df["cluster"] = km.labels_

# Map of the zone points, coloured by cluster label.
fig = px.scatter_mapbox(
    df,
    lat="Latitude",
    lon="Longitude",
    color="cluster",
    mapbox_style="carto-positron",
)
fig.show()

In [12]:
# Preview the table again to check the newly added "cluster" column.
df.head()

Unnamed: 0,Y,X,Latitude,Longitude,Color,cluster
0,79,571,50.198832,3.355932,"rgb(206,136,255)",17
1,80,571,50.188113,3.355932,"rgb(206,136,255)",17
2,80,579,50.188113,3.480975,"rgb(208,142,255)",17
3,81,572,50.177394,3.371562,"rgb(206,136,255)",17
4,83,577,50.155957,3.449714,"rgb(208,142,255)",17


In [13]:
# Group the (Latitude, Longitude) pairs by cluster label. Clusters appear in
# order of first occurrence in df, and points keep their row order.
clusters = defaultdict(list)

# Iterating the three columns in parallel avoids building a Series per row,
# which is what makes DataFrame.iterrows slow.
for label, latitude, longitude in zip(df["cluster"], df["Latitude"], df["Longitude"]):
    clusters[label].append((latitude, longitude))

# One list of points per cluster; each list is used as a zone outline below.
zones = list(clusters.values())

zones

[[(50.19883167500197, 3.355932073245701),
  (50.188113075731, 3.355932073245701),
  (50.188113075731, 3.480975261850152),
  (50.17739447646005, 3.3715624718212576),
  (50.15595727791811, 3.4497144646990394),
  (50.08092708302136, 3.480975261850152),
  (49.87727369687303, 3.355932073245701),
  (49.86655509760206, 3.6685400447568304),
  (49.81296210124724, 3.4966056604257085),
  (49.77008770416338, 3.62164884903016),
  (49.759369104892414, 3.574757653303492),
  (49.74865050562145, 3.4028232689723703),
  (49.72721330707952, 3.559127254727936),
  (49.69505750926663, 3.480975261850152),
  (49.69505750926663, 3.652909646181274),
  (49.69505750926663, 3.6685400447568304),
  (49.69505750926663, 3.6841704433323863),
  (49.69505750926663, 3.699800841907943),
  (49.68433890999566, 3.246519283216805),
  (49.68433890999566, 3.652909646181274),
  (49.6736203107247, 3.293410478943476),
  (49.6736203107247, 3.512236059001265),
  (49.6736203107247, 3.559127254727936),
  (49.6736203107247, 3.71543124048

In [14]:
# Base layer: every zone point (column 0 = latitude, column 1 = longitude).
fig = px.scatter_mapbox(global_zone, lat=0, lon=1,
                        zoom=5,
                        height=800)

found_coordinates = []
found_lookup = set()  # same content as found_coordinates, for O(1) membership tests

# First pass: record every test point that falls inside at least one zone.
for zone in zones:
    print("start zone")
    for place in coordinates:
        if place in found_lookup:
            continue
        if is_in_zone(place, zone):
            found_coordinates.append(place)
            found_lookup.add(place)

# Second pass: draw each test point exactly once, green if some zone contains
# it and red otherwise. Adding a red marker for every (zone, place) miss would
# pile up to len(zones) * len(coordinates) traces on the figure.
colors = ["green" if place in found_lookup else "red" for place in coordinates]

fig.add_scattermapbox(lat=[place[0] for place in coordinates],
                      lon=[place[1] for place in coordinates],
                      mode="markers",
                      marker=dict(size=10, color=colors))

fig.update_layout(mapbox_style="open-street-map")
fig.show()

start zone
start zone
start zone
start zone
start zone
start zone
start zone
start zone
start zone
start zone
start zone
start zone
start zone
start zone
start zone
start zone
start zone
start zone
start zone
start zone
start zone
start zone
start zone
start zone
start zone
start zone
start zone
start zone
start zone
start zone
start zone
start zone
start zone
start zone
start zone
start zone
start zone
start zone
start zone
start zone
start zone
start zone
start zone
start zone
start zone
start zone
start zone
start zone
start zone
start zone
