In [1]:
import pandas as pd
from extra_utils import colors

# Lower-case city keys; each one maps to a CSV via ds_file_name() and is
# indexed positionally by the per-city cells further down.
cities = [
    "chennai",
    "delhi",
    "hyderabad",
    "kolkata",
    "mumbai",
]

def ds_file_name(cityname):
    """Return the relative path of the metro-station CSV for ``cityname``.

    The path is built purely by string formatting; nothing is checked
    against the file system.
    """
    template = "complete-dataset/{}-metro-data.csv"
    return template.format(cityname)

In [2]:
def _attach_cluster_columns(df, centers, labels):
    """Annotate ``df`` in place with the cluster each row was assigned to.

    Adds four columns: 'Cluster Latitude' / 'Cluster Longitude' (coordinates
    of the row's centroid), 'Cluster' (the centroid's index in ``centers``)
    and 'Cluster Distance' (haversine distance between the row's own
    coordinates and that centroid, in haversine's default unit).
    """
    from haversine import haversine

    # Indexing the centroid array with the label vector yields one centroid
    # row per station, so no hand-maintained position counter is needed.
    assigned = centers[labels]

    df['Cluster Latitude'] = assigned[:, 0]
    df['Cluster Longitude'] = assigned[:, 1]
    df['Cluster'] = labels
    df['Cluster Distance'] = [
        haversine((c_lat, c_long), (p_lat, p_long))
        for (c_lat, c_long), p_lat, p_long in zip(
            assigned, df['Latitude'].to_numpy(), df['Longitude'].to_numpy()
        )
    ]


def _add_centroid_markers(mst_map, centers):
    """Add one labelled folium marker per centroid in ``centers`` to ``mst_map``."""
    import folium

    for idx, (x, y) in enumerate(centers):
        html = f"""
            <link href='https://fonts.googleapis.com/css?family=Roboto' rel='stylesheet' type='text/css'>
            <div style = "font-family: 'Roboto', sans-serif;">
                <h4> Station {idx + 1}</h4>
            </div>
            """
        iframe = folium.IFrame(html=html, width = 200, height = 50)
        popup = folium.Popup(iframe, max_width=2650)
        folium.Marker(
            location = [x, y],
            popup = popup,
            icon = folium.Icon(
                # Wrap around the palette: k can be larger than the number of
                # entries in `colors` (assumes `colors` is a non-empty
                # sequence - TODO confirm in extra_utils).
                color = colors[idx % len(colors)]
            )
        ).add_to(mst_map)


def get_mst_cluster(df, cityname, points = True):
    """Cluster a city's stations with k-means and build an MST map of the centroids.

    Parameters
    ----------
    df : pandas.DataFrame
        Station table with 'Latitude' and 'Longitude' columns. It is modified
        in place: 'Cluster Latitude', 'Cluster Longitude', 'Cluster' and
        'Cluster Distance' columns are added.
    cityname : str
        Key into the per-city cluster-count table below; also forwarded to
        ``mst_utils.gen_mst``.
    points : bool, default True
        When true, a folium marker is added to the map for every centroid.

    Returns
    -------
    The object returned by ``mst_utils.gen_mst(cityname, centroid_frame)``
    (presumably a folium map, since markers are attached with ``add_to``).

    Raises
    ------
    KeyError
        If ``cityname`` has no entry in the cluster-count table.
    ValueError
        If ``df`` has no rows.
    """
    from sklearn.cluster import KMeans
    from mst_utils import gen_mst

    # Hand-picked cluster counts per city (presumably tuned offline - TODO confirm).
    city_to_best_k = {
        "chennai": 20,
        "delhi": 100,
        "hyderabad": 30,
        "kolkata": 15,
        "mumbai": 80
    }

    # k is decided here: the configured value, capped at the number of
    # stations so k-means is never asked for more clusters than points.
    station_count = len(df.index)
    print(f"total stations = {station_count}")
    k = min(city_to_best_k[cityname], station_count)
    print(f"k = {k}")
    if k < 1:
        raise ValueError(f"no stations to cluster for {cityname!r}")

    coords = df[["Latitude", "Longitude"]]
    # Fixed random_state keeps the clustering reproducible across runs.
    model = KMeans(n_clusters = k, init = 'k-means++', random_state = 1729)
    model.fit(coords)

    coord_clusters = model.cluster_centers_
    cluster_vals = model.predict(coords)

    _attach_cluster_columns(df, coord_clusters, cluster_vals)

    # The centroids become the "stations" handed to the MST builder; they are
    # labelled "1".."k" in centroid order.
    cdf = pd.DataFrame(coord_clusters, columns=['Latitude', 'Longitude'])
    cdf['Station Name'] = [f"{i + 1}" for i in range(k)]

    mst_map = gen_mst(cityname, cdf)

    if points:
        _add_centroid_markers(mst_map, coord_clusters)

    return mst_map

In [3]:
def display_mst_using_kmeans(cityname):
    """Read ``cityname``'s station CSV and return its k-means MST map.

    The frame is loaded fresh from the path given by ds_file_name() and
    passed to get_mst_cluster() with centroid markers switched off.
    """
    stations = pd.read_csv(ds_file_name(cityname))
    return get_mst_cluster(stations, cityname, points=False)

In [4]:
# First city in `cities`. `city_no` stays a notebook-level name because
# later cells may read it.
city_no = 0
chosen_city = cities[city_no]
print(chosen_city.capitalize())
display_mst_using_kmeans(chosen_city)

Chennai


  from .autonotebook import tqdm as notebook_tqdm


total stations = 40
k = 20
total length of metro line = 41.00400258872887 km


In [5]:
# Resolve the index by name instead of incrementing shared state, so that
# re-running this cell (or running cells out of order) always shows Delhi.
city_no = cities.index('delhi')
print(cities[city_no].capitalize())
display_mst_using_kmeans(cities[city_no])

Delhi
total stations = 232
k = 100
total length of metro line = 247.59825322223583 km


In [6]:
# Resolve the index by name instead of incrementing shared state, so that
# re-running this cell (or running cells out of order) always shows Hyderabad.
city_no = cities.index('hyderabad')
print(cities[city_no].capitalize())
display_mst_using_kmeans(cities[city_no])

Hyderabad
total stations = 57
k = 30
total length of metro line = 52.92208653465919 km


In [7]:
# Resolve the index by name instead of incrementing shared state, so that
# re-running this cell (or running cells out of order) always shows Kolkata.
city_no = cities.index('kolkata')
print(cities[city_no].capitalize())
display_mst_using_kmeans(cities[city_no])

Kolkata
total stations = 33
k = 15
total length of metro line = 37.12222249950771 km


In [8]:
# Resolve the index by name instead of incrementing shared state, so that
# re-running this cell (or running cells out of order) always shows Mumbai
# rather than stepping past the end of `cities`.
city_no = cities.index('mumbai')
print(cities[city_no].capitalize())
display_mst_using_kmeans(cities[city_no])

Mumbai
total stations = 176
k = 80
total length of metro line = 234.9591506979705 km
