In [None]:
import numpy as np
import pandas as pd
import sys
import os
import shutil
from sklearn.metrics import silhouette_score

<h2>데이터 전처리<h2>

In [None]:
def read_text_files_to_dataframe(folder_path):
    
    latitude = []
    longitude = []
    date = []
    time = []

    # 폴더 내의 모든 텍스트 파일을 읽음
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):
            file_path = os.path.join(folder_path, filename)
            
            # 텍스트 파일을 DataFrame에 추가

            with open(file_path, 'r') as file:
                data = file.read()

            data = data.split('\n')[6:-1]
            for i in range(len(data)):
                line = data[i].split(',')
                latitude.append(line[0])
                longitude.append(line[1])
                date.append(line[5])
                time.append(line[6])

    df = pd.DataFrame({"latitude":latitude, "longitude":longitude, "date":date, "time":time})
    df['latitude'] = df['latitude'].astype(float)
    df['longitude'] = df['longitude'].astype(float)
    df['datetime'] = pd.to_datetime(df['date'] + ' ' + df['time'], format='%Y-%m-%d %H:%M:%S')
    df['datetime'] = df['datetime'].dt.floor('T')
    df = df.drop(['date', 'time'], axis=1)
    df = df.drop_duplicates(['datetime'], ignore_index=True)

    return df

In [None]:
folder_path = "C:\\Users\\sk002\\OneDrive\\바탕 화면\\학교\\Yoodori\\Geolife Trajectories 1.3\\Data\\000\\Trajectory"

dataframe = read_text_files_to_dataframe(folder_path)

dataframe

<h2>의미장소 추출<h2>

In [None]:
from pyclustering.cluster.gmeans import gmeans
from pyclustering.cluster import cluster_visualizer
from pyclustering.cluster import cluster_visualizer_multidim
import folium

In [None]:
def gmeans_(df):
    # 두 열을 선택하고 넘파이 배열로 변환
    selected_columns = ['latitude', 'longitude']
    result_list = df[selected_columns].values.tolist()
    
    gmeans_instance = gmeans(result_list).process()

    # Extract clustering results: clusters and their centers
    clusters = gmeans_instance.get_clusters()
    centers = gmeans_instance.get_centers()

    return clusters, centers
    

In [None]:
def map(centers, i):

    # 예제 데이터 생성 (서울의 위도, 경도)
    locations = centers

    # 지도의 중심 좌표 설정
    map_center = [sum(x[0] for x in locations) / len(locations), sum(x[1] for x in locations) / len(locations)]

    # Folium 맵 생성
    mymap = folium.Map(location=map_center, zoom_start=12)

    # 위치 데이터를 지도에 마커로 표시
    for loc in locations:
        folium.Marker(location=loc, popup='Location').add_to(mymap)

    # 지도를 HTML 파일로 저장하거나 주피터 노트북에서 바로 표시
    file_name = f'map{i}.html'
    mymap.save(file_name)


In [None]:
def fileReader(folder_path, filename):
    latitude = []
    longitude = []
    date = []
    time = []
    file_path = os.path.join(folder_path, filename)
            
    # 텍스트 파일을 DataFrame에 추가

    with open(file_path, 'r') as file: 
        data = file.read()

    data = data.split('\n')[6:-1]
    for i in range(len(data)):
        line = data[i].split(',')
        latitude.append(line[0])
        longitude.append(line[1])
        date.append(line[5])
        time.append(line[6])

    df = pd.DataFrame({"latitude":latitude, "longitude":longitude, "date":date, "time":time})

    return df

In [None]:
def app(folder_path):
    
    j = 0
    data_dict = {}
    # 폴더 내의 모든 텍스트 파일을 읽음
    for filename in os.listdir(folder_path):

        df = fileReader(folder_path, filename)

        df['latitude'] = df['latitude'].astype(float)
        df['longitude'] = df['longitude'].astype(float)
        df['datetime'] = pd.to_datetime(df['date'] + ' ' + df['time'], format='%Y-%m-%d %H:%M:%S')
        df['datetime'] = df['datetime'].dt.floor('T')
        df = df.drop(['date', 'time'], axis=1)
        df = df.drop_duplicates(['datetime'], ignore_index=True)

        clusters, centers = gmeans(df)

        data_dict[j] = pd.DataFrame({"clusters":clusters, "centers":centers})
        
        for k in range(len(data_dict[j].clusters)):
            if (len(data_dict[j].clusters[k]) < 6):
                data_dict[j].drop(index=k, inplace=True)
        data_dict[j] = data_dict[j].sort_index(axis=1)

        map(centers, j)
        
        j += 1
    return data_dict

In [None]:
#folder_name = r'C:\Users\sk002\OneDrive\바탕 화면\학교\Yoodori\Geolife Trajectories 1.3\Data\000\Trajectory'
#data_dict = app(folder_name)

In [None]:
def fileReader(filename):

    latitude = []   # 위도
    longitude = []  # 경도
    date = []       # 날짜
    time = []       # 시간

    with open(filename, 'r') as file:
        data = file.read()

    # 데이터에 불필요한 부분 제거
    # 추후 데이터 형식에 따라 수정 필요 *
    data = data.split('\n')[6:-1]
    for i in range(len(data)):
        line = data[i].split(',')
        latitude.append(line[0])    # 위도
        longitude.append(line[1])   # 경도
        date.append(line[5])        # 날짜
        time.append(line[6])        # 시간

    df = pd.DataFrame({"latitude":latitude, "longitude":longitude, "date":date, "time":time})

    return df

In [None]:
def gmeansFit(df):
    # 두 열을 선택하고 넘파이 배열로 변환
    selectedColumns = ['latitude', 'longitude']
    resultList = df[selectedColumns].values.tolist()    # 리스트로 변환
    
    gmeansInstance = gmeans(resultList).process()       # 클러스터링

    centers = gmeansInstance.get_centers()              # 클러스터의 중심 (의미장소)
    clusters = gmeansInstance.get_clusters()            # 분류된 클러스터


    return clusters, centers

In [None]:
def gmeansFunc(file_name):
    
    j = 0
    data_dict = {}

    df = fileReader(file_name)

    # 위도 경도 데이터 형식 변경
    df['latitude'] = df['latitude'].astype(float)
    df['longitude'] = df['longitude'].astype(float)
    # 날짜, 시간 데이터 병합
    df['datetime'] = pd.to_datetime(df['date'] + ' ' + df['time'], format='%Y-%m-%d %H:%M:%S')
    df['datetime'] = df['datetime'].dt.floor('T')
    df = df.drop(['date', 'time'], axis=1)
    # 1분 단위로 데이터 병합
    df = df.drop_duplicates(['datetime'], ignore_index=True)

    # 의미장소 추출
    clusters, centers = gmeansFit(df)
        
    # 딕셔너리 형식으로 데이터 저장
    data_dict[j] = pd.DataFrame({"clusters":clusters, "centers":centers})
        
    # 클러스터가 10개 미만인 의미장소 제거
    for k in range(len(data_dict[j].clusters)):
        if (len(data_dict[j].clusters[k]) < 10):
            data_dict[j].drop(index=k, inplace=True)
    data_dict[j] = data_dict[j].sort_index(axis=1)
        
    j += 1
    return data_dict

In [None]:
file_name = r'C:\Users\sk002\OneDrive\바탕 화면\학교\Yoodori\Geolife Trajectories 1.3\Data\000\Trajectory\20090403011657.txt'
data_dict = gmeansFunc(file_name)

<h2>서버에 올릴 코드<h2>

In [76]:
from pyclustering.cluster.gmeans import gmeans
from pyclustering.cluster import cluster_visualizer
from pyclustering.cluster import cluster_visualizer_multidim
from collections import Counter
import numpy as np
import pandas as pd
import os
import folium

In [37]:
def fileReader(filename):

    latitude = []   # 위도
    longitude = []  # 경도
    date = []       # 날짜
    time = []       # 시간

    with open(filename, 'r') as file:
        data = file.read()

    # 데이터에 불필요한 부분 제거
    # 추후 데이터 형식에 따라 수정 필요 *
    data = data.split('\n')[6:-1]
    for i in range(len(data)):
        line = data[i].split(',')
        latitude.append(line[0])    # 위도
        longitude.append(line[1])   # 경도
        date.append(line[5])        # 날짜
        time.append(line[6])        # 시간

    df = pd.DataFrame({"latitude":latitude, "longitude":longitude, "date":date, "time":time})

    
    df['latitude'] = df['latitude'].astype(float)
    df['longitude'] = df['longitude'].astype(float)
    df['datetime'] = pd.to_datetime(df['date'] + ' ' + df['time'], format='%Y-%m-%d %H:%M:%S')
    df['datetime'] = df['datetime'].dt.floor('T')
    # 시간대와 요일 추가
    # 시간대 형식 : f00t04 f20t24
    # 4시간 단위로 분리
    df['hour_block'] = 'f' + ((df['datetime'].dt.hour) // 4 * 4).astype(str).str.zfill(2) + 't' + ((df['datetime'].dt.hour + 4) // 4 * 4).astype(str).str.zfill(2)
    df['day_of_week'] = df['datetime'].dt.day_name()
    df = df.drop(['date', 'time'], axis=1)
    df = df.drop_duplicates(['datetime'], ignore_index=True)

    return df

In [38]:
# 의미장소 추출
def gmeansFit(df):
    # 두 열을 선택하고 넘파이 배열로 변환
    selectedColumns = ['latitude', 'longitude']
    resultList = df[selectedColumns].values.tolist()    # 리스트로 변환
    
    gmeansInstance = gmeans(resultList).process()       # 클러스터링

    centers = gmeansInstance.get_centers()              # 클러스터의 중심 (의미장소)
    clusters = gmeansInstance.get_clusters()            # 분류된 클러스터

    

    return clusters, centers

In [39]:
def gmeansFunc(df):


    clusters, centers = gmeansFit(df)

    data_df = pd.DataFrame({"clusters":clusters, "centers":centers})
        
    for k in range(len(data_df.clusters)):
        if (len(data_df.clusters[k]) < 10):
            data_df.drop(index=k, inplace=True)
    data_df = data_df.sort_index(axis=1)
    data_df = data_df.reset_index(drop=True)
    

    return data_df

In [101]:
def gmeansFunc(df):


    clusters, centers = gmeansFit(df)

    data_df = pd.DataFrame({"clusters":clusters, "centers":centers})
        
    for k in range(len(data_df.clusters)):
        if (len(data_df.clusters[k]) < 10):
            data_df.drop(index=k, inplace=True)
    data_df = data_df.sort_index(axis=1)
    data_df = data_df.reset_index(drop=True)
    
    df['clusterNo'] = -1
    for i in range(len(data_df)):
        for j in range(len(data_df['clusters'].iloc[i])):
            k = data_df['clusters'].iloc[i][j]
            df['clusterNo'].iloc[k] = i

    df = df[df['clusterNo'] != -1]


    data_df['hour_block'] = 0
    data_df['day_of_week'] = 0
    for i in range(max(df['clusterNo'])+1):
        
        counter = Counter(df[df['clusterNo'] == i]['hour_block'])
        most_hour_value = counter.most_common(1)[0][0]

        counter = Counter(df[df['clusterNo'] == i]['day_of_week'])
        most_day_value = counter.most_common(1)[0][0]

        data_df['hour_block'].iloc[i] = most_hour_value
        data_df['day_of_week'].iloc[i] = most_day_value


    return data_df

In [102]:
file_name = r'C:\Users\sk002\OneDrive\바탕 화면\학교\Yoodori\Geolife Trajectories 1.3\Data\003\Trajectory\20090403011657.txt'
df1 = fileReader(file_name)
data_df_1 = gmeansFunc(df1)
data_df_1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['clusterNo'].iloc[k] = i
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['clusterNo'].iloc[k] = i
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['clusterNo'].iloc[k] = i
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['clusterNo'].iloc[k] = i
A value is trying to be set on a copy of a slice from a 

Unnamed: 0,centers,clusters,hour_block,day_of_week
0,"[39.90903863636363, 116.4055848181818]","[1020, 1021, 1022, 1023, 1029, 1030, 1031, 103...",f08t12,Saturday
1,"[39.9097909090909, 116.40469272727273]","[1037, 1038, 1039, 1040, 1041, 1042, 1043, 104...",f12t16,Saturday
2,"[39.89846146153847, 116.38943146153848]","[1075, 1076, 1077, 1078, 1079, 1080, 1081, 108...",f12t16,Saturday
3,"[39.98576385, 116.41272174999999]","[89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 1...",f08t12,Friday
4,"[40.00947544444444, 116.31560994444447]","[479, 486, 488, 493, 495, 496, 497, 498, 500, ...",f16t20,Friday
5,"[40.00976641860466, 116.31521109302322]","[318, 337, 350, 351, 352, 353, 354, 355, 356, ...",f00t04,Saturday
6,"[40.00972907407408, 116.31529355555557]","[217, 243, 254, 326, 328, 339, 347, 357, 358, ...",f12t16,Friday
7,"[40.0098601, 116.31529659999997]","[44, 46, 313, 314, 331, 336, 343, 344, 364, 36...",f12t16,Friday
8,"[40.00996627272727, 116.31515850000005]","[38, 45, 309, 330, 342, 370, 388, 534, 692, 72...",f00t04,Saturday
9,"[40.010062999999995, 116.31538221428572]","[220, 221, 307, 308, 334, 335, 708, 720, 722, ...",f00t04,Friday


In [42]:
df1

Unnamed: 0,latitude,longitude,datetime,hour_block,day_of_week,clusterNo
0,40.001520,116.311859,2009-04-03 01:16:00,f00t04,Friday,-1
1,40.001437,116.312250,2009-04-03 01:17:00,f00t04,Friday,-1
2,40.002051,116.312630,2009-04-03 01:18:00,f00t04,Friday,-1
3,40.004016,116.312476,2009-04-03 01:19:00,f00t04,Friday,-1
4,40.004578,116.314774,2009-04-03 01:20:00,f00t04,Friday,-1
...,...,...,...,...,...,...
1166,40.009642,116.315100,2009-04-04 14:31:00,f12t16,Saturday,11
1167,40.009683,116.315028,2009-04-04 14:32:00,f12t16,Saturday,8
1168,40.009551,116.315061,2009-04-04 14:33:00,f12t16,Saturday,11
1169,40.009591,116.315148,2009-04-04 14:34:00,f12t16,Saturday,11


In [43]:
data_df_1

Unnamed: 0,centers,clusters
0,"[40.009822434782606, 116.31537617391304]","[314, 331, 332, 343, 344, 345, 346, 347, 359, ..."
1,"[40.00977977500001, 116.31522249999998]","[44, 313, 318, 336, 352, 353, 355, 356, 369, 3..."
2,"[40.00972261538462, 116.31529069230768]","[217, 243, 254, 326, 328, 339, 357, 358, 362, ..."
3,"[40.00985584000001, 116.31510704]","[47, 250, 387, 415, 462, 534, 574, 592, 639, 6..."
4,"[40.010041384615384, 116.31493176923075]","[36, 37, 48, 49, 213, 214, 215, 310, 606, 735,..."
5,"[40.009942578947374, 116.3152110526316]","[38, 45, 46, 309, 330, 342, 370, 413, 692, 699..."
6,"[40.010017999999995, 116.31542119999999]","[307, 308, 334, 335, 720, 822, 823, 824, 825, ..."
7,"[40.00968718181818, 116.315785]","[211, 304, 333, 338, 477, 478, 479, 487, 489, ..."
8,"[40.00968004166666, 116.3150474166666]","[218, 238, 322, 323, 324, 368, 378, 379, 381, ..."
9,"[40.009747591836735, 116.31510653061221]","[242, 247, 248, 317, 341, 382, 398, 428, 455, ..."


In [44]:
df1

Unnamed: 0,latitude,longitude,datetime,hour_block,day_of_week,clusterNo
0,40.001520,116.311859,2009-04-03 01:16:00,f00t04,Friday,-1
1,40.001437,116.312250,2009-04-03 01:17:00,f00t04,Friday,-1
2,40.002051,116.312630,2009-04-03 01:18:00,f00t04,Friday,-1
3,40.004016,116.312476,2009-04-03 01:19:00,f00t04,Friday,-1
4,40.004578,116.314774,2009-04-03 01:20:00,f00t04,Friday,-1
...,...,...,...,...,...,...
1166,40.009642,116.315100,2009-04-04 14:31:00,f12t16,Saturday,11
1167,40.009683,116.315028,2009-04-04 14:32:00,f12t16,Saturday,8
1168,40.009551,116.315061,2009-04-04 14:33:00,f12t16,Saturday,11
1169,40.009591,116.315148,2009-04-04 14:34:00,f12t16,Saturday,11


In [45]:
df1['clusterNo'] = -1
for i in range(len(data_df_1)):
    for j in range(len(data_df_1['clusters'].iloc[i])):
        k = data_df_1['clusters'].iloc[i][j]
        df1['clusterNo'].iloc[k] = i


df1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['clusterNo'].iloc[k] = i
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['clusterNo'].iloc[k] = i
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['clusterNo'].iloc[k] = i
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['clusterNo'].iloc[k] = i
A value is trying to be set on a copy of a slice fro

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['clusterNo'].iloc[k] = i
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['clusterNo'].iloc[k] = i
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['clusterNo'].iloc[k] = i
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['clusterNo'].iloc[k] = i
A value is trying to be set on a copy of a slice fro

Unnamed: 0,latitude,longitude,datetime,hour_block,day_of_week,clusterNo
0,40.001520,116.311859,2009-04-03 01:16:00,f00t04,Friday,-1
1,40.001437,116.312250,2009-04-03 01:17:00,f00t04,Friday,-1
2,40.002051,116.312630,2009-04-03 01:18:00,f00t04,Friday,-1
3,40.004016,116.312476,2009-04-03 01:19:00,f00t04,Friday,-1
4,40.004578,116.314774,2009-04-03 01:20:00,f00t04,Friday,-1
...,...,...,...,...,...,...
1166,40.009642,116.315100,2009-04-04 14:31:00,f12t16,Saturday,11
1167,40.009683,116.315028,2009-04-04 14:32:00,f12t16,Saturday,8
1168,40.009551,116.315061,2009-04-04 14:33:00,f12t16,Saturday,11
1169,40.009591,116.315148,2009-04-04 14:34:00,f12t16,Saturday,11


In [46]:
file_name = r'C:\Users\sk002\OneDrive\바탕 화면\학교\Yoodori\Geolife Trajectories 1.3\Data\003\Trajectory\20090120002837.txt'
df2 = fileReader(file_name)
data_df_2 = gmeansFunc(df2)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['clusterNo'].iloc[k] = i
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['clusterNo'].iloc[k] = i
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['clusterNo'].iloc[k] = i
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['clusterNo'].iloc[k] = i
A value is trying to be set on a copy of a slice from a 

In [47]:
df2['clusterNo'] = -1
for i in range(len(data_df_2)):
    for j in range(len(data_df_2['clusters'].iloc[i])):
        k = data_df_2['clusters'].iloc[i][j]
        df2['clusterNo'].iloc[k] = i


df2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['clusterNo'].iloc[k] = i
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['clusterNo'].iloc[k] = i
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['clusterNo'].iloc[k] = i
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['clusterNo'].iloc[k] = i
A value is trying to be set on a copy of a slice fro

Unnamed: 0,latitude,longitude,datetime,hour_block,day_of_week,clusterNo
0,39.999840,116.327191,2009-01-20 00:28:00,f00t04,Tuesday,-1
1,39.999991,116.326629,2009-01-20 00:29:00,f00t04,Tuesday,-1
2,40.001186,116.325686,2009-01-20 00:30:00,f00t04,Tuesday,-1
3,40.002164,116.324517,2009-01-20 00:31:00,f00t04,Tuesday,-1
4,40.003486,116.323199,2009-01-20 00:32:00,f00t04,Tuesday,-1
...,...,...,...,...,...,...
939,32.057004,121.270890,2009-01-21 02:11:00,f00t04,Wednesday,-1
940,32.057173,121.269710,2009-01-21 02:16:00,f00t04,Wednesday,-1
941,32.056679,121.271410,2009-01-21 02:17:00,f00t04,Wednesday,-1
942,32.056826,121.271292,2009-01-21 02:18:00,f00t04,Wednesday,-1


In [48]:
file_name = r'C:\Users\sk002\OneDrive\바탕 화면\학교\Yoodori\Geolife Trajectories 1.3\Data\003\Trajectory\20090221034838.txt'
df3 = fileReader(file_name)
data_df_3 = gmeansFunc(df3)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['clusterNo'].iloc[k] = i
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['clusterNo'].iloc[k] = i
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['clusterNo'].iloc[k] = i
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['clusterNo'].iloc[k] = i
A value is trying to be set on a copy of a slice from a 

In [49]:
df3['clusterNo'] = -1
for i in range(len(data_df_3)):
    for j in range(len(data_df_3['clusters'].iloc[i])):
        k = data_df_3['clusters'].iloc[i][j]
        df3['clusterNo'].iloc[k] = i


df3

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3['clusterNo'].iloc[k] = i
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3['clusterNo'].iloc[k] = i
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3['clusterNo'].iloc[k] = i
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3['clusterNo'].iloc[k] = i
A value is trying to be set on a copy of a slice fro

Unnamed: 0,latitude,longitude,datetime,hour_block,day_of_week,clusterNo
0,39.999168,116.334129,2009-02-21 03:48:00,f00t04,Saturday,-1
1,39.999380,116.334245,2009-02-21 03:49:00,f00t04,Saturday,-1
2,40.000569,116.333664,2009-02-21 03:50:00,f00t04,Saturday,-1
3,40.000496,116.333412,2009-02-21 03:51:00,f00t04,Saturday,-1
4,40.000529,116.332941,2009-02-21 03:52:00,f00t04,Saturday,-1
...,...,...,...,...,...,...
711,39.996333,116.338616,2009-02-21 22:52:00,f20t24,Saturday,-1
712,39.996465,116.338802,2009-02-21 22:53:00,f20t24,Saturday,-1
713,39.996531,116.338755,2009-02-21 22:54:00,f20t24,Saturday,-1
714,39.997935,116.338913,2009-02-21 22:55:00,f20t24,Saturday,2


In [50]:
file_name = r'C:\Users\sk002\OneDrive\바탕 화면\학교\Yoodori\Geolife Trajectories 1.3\Data\003\Trajectory\20090214045230.txt'
df4 = fileReader(file_name)
data_df_4 = gmeansFunc(df4)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['clusterNo'].iloc[k] = i
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['clusterNo'].iloc[k] = i
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['clusterNo'].iloc[k] = i
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['clusterNo'].iloc[k] = i
A value is trying to be set on a copy of a slice from a 

In [51]:
df4['clusterNo'] = -1
for i in range(len(data_df_4)):
    for j in range(len(data_df_4['clusters'].iloc[i])):
        k = data_df_4['clusters'].iloc[i][j]
        df4['clusterNo'].iloc[k] = i


df4

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df4['clusterNo'].iloc[k] = i
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df4['clusterNo'].iloc[k] = i
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df4['clusterNo'].iloc[k] = i
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df4['clusterNo'].iloc[k] = i
A value is trying to be set on a copy of a slice fro

Unnamed: 0,latitude,longitude,datetime,hour_block,day_of_week,clusterNo
0,39.998569,116.339200,2009-02-14 04:52:00,f04t08,Saturday,21
1,39.998634,116.338622,2009-02-14 04:53:00,f04t08,Saturday,13
2,39.998102,116.337258,2009-02-14 04:54:00,f04t08,Saturday,-1
3,39.997688,116.334919,2009-02-14 04:55:00,f04t08,Saturday,-1
4,39.997469,116.333972,2009-02-14 04:56:00,f04t08,Saturday,-1
...,...,...,...,...,...,...
755,39.999533,116.338270,2009-02-14 23:35:00,f20t24,Saturday,5
756,39.998664,116.338281,2009-02-14 23:36:00,f20t24,Saturday,15
757,39.998998,116.337676,2009-02-14 23:37:00,f20t24,Saturday,-1
758,39.999212,116.338239,2009-02-14 23:38:00,f20t24,Saturday,6


In [52]:
file_name = r'C:\Users\sk002\OneDrive\바탕 화면\학교\Yoodori\Geolife Trajectories 1.3\Data\003\Trajectory\20081202160051.txt'
df5 = fileReader(file_name)
data_df_5 = gmeansFunc(df5)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['clusterNo'].iloc[k] = i
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['clusterNo'].iloc[k] = i
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['clusterNo'].iloc[k] = i
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['clusterNo'].iloc[k] = i
A value is trying to be set on a copy of a slice from a 

In [53]:
df5['clusterNo'] = -1
for i in range(len(data_df_5)):
    for j in range(len(data_df_5['clusters'].iloc[i])):
        k = data_df_5['clusters'].iloc[i][j]
        df5['clusterNo'].iloc[k] = i


df5

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df5['clusterNo'].iloc[k] = i
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df5['clusterNo'].iloc[k] = i
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df5['clusterNo'].iloc[k] = i
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df5['clusterNo'].iloc[k] = i
A value is trying to be set on a copy of a slice fro

Unnamed: 0,latitude,longitude,datetime,hour_block,day_of_week,clusterNo
0,39.999869,116.326993,2008-12-02 16:00:00,f16t20,Tuesday,17
1,39.999895,116.327039,2008-12-02 16:01:00,f16t20,Tuesday,17
2,39.999880,116.325793,2008-12-02 16:02:00,f16t20,Tuesday,17
3,40.000132,116.324713,2008-12-02 16:03:00,f16t20,Tuesday,-1
4,40.001176,116.324511,2008-12-02 16:04:00,f16t20,Tuesday,-1
...,...,...,...,...,...,...
680,39.996573,116.326611,2008-12-03 11:30:00,f08t12,Wednesday,-1
681,39.998787,116.326686,2008-12-03 11:31:00,f08t12,Wednesday,17
682,39.999864,116.327365,2008-12-03 11:32:00,f08t12,Wednesday,17
683,39.999924,116.327458,2008-12-03 11:33:00,f08t12,Wednesday,17


In [54]:
data_df_1

Unnamed: 0,centers,clusters
0,"[40.009822434782606, 116.31537617391304]","[314, 331, 332, 343, 344, 345, 346, 347, 359, ..."
1,"[40.00977977500001, 116.31522249999998]","[44, 313, 318, 336, 352, 353, 355, 356, 369, 3..."
2,"[40.00972261538462, 116.31529069230768]","[217, 243, 254, 326, 328, 339, 357, 358, 362, ..."
3,"[40.00985584000001, 116.31510704]","[47, 250, 387, 415, 462, 534, 574, 592, 639, 6..."
4,"[40.010041384615384, 116.31493176923075]","[36, 37, 48, 49, 213, 214, 215, 310, 606, 735,..."
5,"[40.009942578947374, 116.3152110526316]","[38, 45, 46, 309, 330, 342, 370, 413, 692, 699..."
6,"[40.010017999999995, 116.31542119999999]","[307, 308, 334, 335, 720, 822, 823, 824, 825, ..."
7,"[40.00968718181818, 116.315785]","[211, 304, 333, 338, 477, 478, 479, 487, 489, ..."
8,"[40.00968004166666, 116.3150474166666]","[218, 238, 322, 323, 324, 368, 378, 379, 381, ..."
9,"[40.009747591836735, 116.31510653061221]","[242, 247, 248, 317, 341, 382, 398, 428, 455, ..."


In [55]:
data_df_2

Unnamed: 0,centers,clusters
0,"[32.0380186, 120.82664150000001]","[847, 848, 849, 850, 873, 874, 875, 876, 877, ..."
1,"[32.044922299999996, 120.8895796]","[887, 888, 889, 890, 891, 892, 893, 894, 895, ..."
2,"[32.07005584210527, 120.85438831578946]","[818, 819, 820, 821, 822, 823, 824, 825, 826, ..."
3,"[33.55559185714285, 119.70179814285713]","[685, 686, 687, 688, 689, 690, 691, 692, 693, ..."
4,"[34.1647111904762, 118.70941452380951]","[600, 601, 602, 603, 604, 605, 606, 607, 608, ..."
5,"[34.35860088000001, 118.38388452000001]","[575, 576, 577, 578, 579, 580, 581, 582, 583, ..."
6,"[33.6336987037037, 119.04485937037038]","[641, 642, 643, 644, 645, 646, 647, 648, 649, ..."
7,"[33.886386949999995, 118.88300544999997]","[621, 622, 623, 624, 625, 626, 627, 628, 629, ..."
8,"[33.54338811764706, 119.3318145882353]","[668, 669, 670, 671, 672, 673, 674, 675, 676, ..."
9,"[34.2645017, 117.20207580000002]","[509, 510, 511, 512, 513, 514, 515, 516, 517, ..."


In [56]:
data_df_3

Unnamed: 0,centers,clusters
0,"[40.00514978947369, 116.32449673684208]","[24, 25, 26, 50, 51, 52, 53, 54, 55, 56, 57, 5..."
1,"[40.00884143478261, 116.32202982608699]","[27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 3..."
2,"[39.997750538461545, 116.3385786923077]","[677, 678, 679, 680, 681, 682, 683, 684, 685, ..."
3,"[39.99150663636363, 116.33240863636365]","[92, 93, 94, 95, 96, 97, 98, 99, 308, 316, 317]"
4,"[39.99914724999999, 116.33848702777777]","[360, 372, 384, 390, 391, 393, 407, 409, 410, ..."
5,"[39.99928914285714, 116.33854222857143]","[404, 415, 416, 419, 441, 442, 500, 503, 505, ..."
6,"[39.99904251851851, 116.33844137037039]","[338, 342, 348, 352, 355, 359, 361, 373, 374, ..."
7,"[39.998980714285715, 116.33850650000002]","[336, 339, 351, 357, 375, 402, 414, 480, 529, ..."
8,"[39.9989069, 116.33842784999997]","[341, 345, 363, 378, 385, 388, 396, 481, 504, ..."
9,"[39.99906608333333, 116.33864058333332]","[376, 484, 485, 494, 532, 533, 534, 537, 540, ..."


In [57]:
data_df_4

Unnamed: 0,centers,clusters
0,"[39.99105290000001, 116.4462241]","[56, 57, 58, 59, 60, 61, 62, 63, 64, 65]"
1,"[39.99391791666667, 116.44537925000002]","[25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 3..."
2,"[39.99234175, 116.4467025]","[51, 52, 53, 54, 55, 66, 67, 68, 69, 82, 83, 84]"
3,"[39.999380020000004, 116.33852741999999]","[237, 264, 277, 278, 356, 362, 367, 368, 369, ..."
4,"[39.99978657894737, 116.33869163157894]","[218, 247, 248, 249, 263, 270, 275, 496, 502, ..."
5,"[39.999583416666674, 116.33842213888887]","[186, 203, 219, 226, 235, 236, 258, 357, 358, ..."
6,"[39.99935463636364, 116.33815545454546]","[392, 412, 413, 415, 591, 594, 626, 714, 731, ..."
7,"[39.999845500000006, 116.33828465000002]","[207, 227, 234, 365, 366, 370, 371, 373, 375, ..."
8,"[40.000368066666674, 116.33859853333335]","[204, 215, 216, 217, 257, 431, 433, 592, 602, ..."
9,"[40.000900875, 116.33865387500002]","[232, 233, 434, 435, 436, 437, 438, 439, 440, ..."


In [58]:
data_df_5

Unnamed: 0,centers,clusters
0,"[39.98266413043478, 116.3631911304348]","[564, 565, 566, 567, 568, 569, 570, 571, 572, ..."
1,"[39.991809149999995, 116.32640355000001]","[383, 384, 513, 527, 528, 529, 530, 533, 534, ..."
2,"[40.01148578124999, 116.321582]","[97, 104, 105, 119, 121, 128, 137, 142, 205, 2..."
3,"[40.01139318181817, 116.32169872727272]","[80, 100, 103, 113, 115, 116, 296, 297, 298, 2..."
4,"[40.01115827272728, 116.3214548181818]","[76, 117, 118, 126, 127, 131, 153, 155, 156, 2..."
5,"[40.01148286363636, 116.32132849999998]","[98, 99, 125, 148, 149, 152, 212, 214, 247, 28..."
6,"[40.012239541666666, 116.32156995833337]","[29, 31, 34, 35, 36, 53, 54, 92, 94, 132, 143,..."
7,"[40.01258736363637, 116.32173318181816]","[26, 27, 67, 71, 81, 87, 88, 90, 91, 173, 240]"
8,"[40.011956043478264, 116.32153434782609]","[28, 30, 32, 40, 69, 70, 139, 161, 162, 167, 1..."
9,"[40.01183040909091, 116.32164645454546]","[43, 46, 47, 48, 49, 50, 52, 61, 63, 78, 79, 9..."


In [59]:
print(f'df1 : {len(df1)} | df2 : {len(df2)} | df3 : {len(df3)} | df4 : {len(df4)} | df5 : {len(df5)}')

df1 : 1171 | df2 : 944 | df3 : 716 | df4 : 760 | df5 : 685


In [60]:
df1 = df1[df1['clusterNo'] != -1]
df2 = df2[df2['clusterNo'] != -1]
df3 = df3[df3['clusterNo'] != -1]
df4 = df4[df4['clusterNo'] != -1]
df5 = df5[df5['clusterNo'] != -1]

In [61]:
df1 = df1.sort_index()
df2 = df2.sort_index()
df3 = df3.sort_index()
df4 = df4.sort_index()
df5 = df5.sort_index()

In [65]:
df1 = df1.reset_index(drop=True)
df2 = df2.reset_index(drop=True)
df3 = df3.reset_index(drop=True)
df4 = df4.reset_index(drop=True)
df5 = df5.reset_index(drop=True)

In [67]:
df5

Unnamed: 0,latitude,longitude,datetime,hour_block,day_of_week,clusterNo
0,39.999869,116.326993,2008-12-02 16:00:00,f16t20,Tuesday,17
1,39.999895,116.327039,2008-12-02 16:01:00,f16t20,Tuesday,17
2,39.999880,116.325793,2008-12-02 16:02:00,f16t20,Tuesday,17
3,40.012765,116.321583,2008-12-02 16:32:00,f16t20,Tuesday,7
4,40.012546,116.321558,2008-12-02 16:33:00,f16t20,Tuesday,7
...,...,...,...,...,...,...
494,39.992033,116.326227,2008-12-03 11:25:00,f08t12,Wednesday,1
495,39.998787,116.326686,2008-12-03 11:31:00,f08t12,Wednesday,17
496,39.999864,116.327365,2008-12-03 11:32:00,f08t12,Wednesday,17
497,39.999924,116.327458,2008-12-03 11:33:00,f08t12,Wednesday,17


In [72]:
df1[df1['clusterNo'] == 34]

Unnamed: 0,latitude,longitude,datetime,hour_block,day_of_week,clusterNo
40,39.985203,116.438678,2009-04-03 08:28:00,f08t12,Friday,34
41,39.980219,116.447448,2009-04-03 08:29:00,f08t12,Friday,34
42,39.977613,116.451382,2009-04-03 08:30:00,f08t12,Friday,34
43,39.975156,116.455269,2009-04-03 08:31:00,f08t12,Friday,34
44,39.972856,116.458854,2009-04-03 08:32:00,f08t12,Friday,34
45,39.970485,116.462266,2009-04-03 08:33:00,f08t12,Friday,34
46,39.969027,116.46263,2009-04-03 08:34:00,f08t12,Friday,34
47,39.975647,116.469753,2009-04-03 08:35:00,f08t12,Friday,34
58,39.982451,116.476951,2009-04-03 09:28:00,f08t12,Friday,34
59,39.974493,116.467919,2009-04-03 09:29:00,f08t12,Friday,34


In [75]:
type(df1['hour_block'][0])

str

In [79]:
df1[df1['clusterNo'] == 34]['hour_block']

40    f08t12
41    f08t12
42    f08t12
43    f08t12
44    f08t12
45    f08t12
46    f08t12
47    f08t12
58    f08t12
59    f08t12
60    f08t12
61    f08t12
62    f08t12
63    f08t12
Name: hour_block, dtype: object

In [80]:
counter = Counter(df1[df1['clusterNo'] == 34]['hour_block'])
most_value = counter.most_common(1)[0][0]
most_value

'f08t12'

In [None]:
for i in range(max(df1['clusterNo'])):
    counter = Counter(df1[df1['clusterNo'] == i]['hour_block'])
    most_value = counter.most_common(1)[0][0]

In [None]:
df.iloc[1037]

In [17]:
data_df.clusters.iloc[0][0]

NameError: name 'data_df' is not defined

In [None]:
data_df

In [None]:
df['latitude'].iloc[1038]

In [None]:
df['latitude'].iloc[0]

In [17]:
data_df_1['labels']

0    -1
1    -1
2    -1
3    -1
4    -1
5    -1
6     0
7     0
8     0
9     0
10    0
11    0
12   -1
13   -1
14   -1
15   -1
16   -1
17   -1
18   -1
19   -1
20   -1
21   -1
22   -1
23   -1
24   -1
25   -1
26   -1
27   -1
28   -1
29   -1
30   -1
31   -1
32   -1
33   -1
Name: labels, dtype: int64

In [24]:
origin_df1

Unnamed: 0,latitude,longitude,datetime,hour_block,day_of_week,labels
0,40.001520,116.311859,2009-04-03 01:16:00,f00t04,Friday,-1
1,40.001437,116.312250,2009-04-03 01:17:00,f00t04,Friday,-1
2,40.002051,116.312630,2009-04-03 01:18:00,f00t04,Friday,-1
3,40.004016,116.312476,2009-04-03 01:19:00,f00t04,Friday,-1
4,40.004578,116.314774,2009-04-03 01:20:00,f00t04,Friday,-1
...,...,...,...,...,...,...
1166,21.000000,21.000000,21,21,21,21
1167,10.000000,10.000000,10,10,10,10
1168,23.000000,23.000000,23,23,23,23
1169,21.000000,21.000000,21,21,21,21


In [21]:
df1

Unnamed: 0,latitude,longitude,datetime,hour_block,day_of_week
0,40.001520,116.311859,2009-04-03 01:16:00,f00t04,Friday
1,40.001437,116.312250,2009-04-03 01:17:00,f00t04,Friday
2,40.002051,116.312630,2009-04-03 01:18:00,f00t04,Friday
3,40.004016,116.312476,2009-04-03 01:19:00,f00t04,Friday
4,40.004578,116.314774,2009-04-03 01:20:00,f00t04,Friday
...,...,...,...,...,...
1166,40.009642,116.315100,2009-04-04 14:31:00,f12t16,Saturday
1167,40.009683,116.315028,2009-04-04 14:32:00,f12t16,Saturday
1168,40.009551,116.315061,2009-04-04 14:33:00,f12t16,Saturday
1169,40.009591,116.315148,2009-04-04 14:34:00,f12t16,Saturday


In [23]:
origin_df1['labels'] = -1
for cluster_idx, (cluster, center) in enumerate(zip(data_df_1.clusters, data_df_1.centers)):
    for point_idx in cluster:
        origin_df1.iloc[point_idx] = cluster_idx
        
origin_df1

Unnamed: 0,latitude,longitude,datetime,hour_block,day_of_week,labels
0,40.001520,116.311859,2009-04-03 01:16:00,f00t04,Friday,-1
1,40.001437,116.312250,2009-04-03 01:17:00,f00t04,Friday,-1
2,40.002051,116.312630,2009-04-03 01:18:00,f00t04,Friday,-1
3,40.004016,116.312476,2009-04-03 01:19:00,f00t04,Friday,-1
4,40.004578,116.314774,2009-04-03 01:20:00,f00t04,Friday,-1
...,...,...,...,...,...,...
1166,21.000000,21.000000,21,21,21,21
1167,10.000000,10.000000,10,10,10,10
1168,23.000000,23.000000,23,23,23,23
1169,21.000000,21.000000,21,21,21,21


In [19]:
df2['labels'] = -1
for cluster_idx, (cluster, center) in enumerate(zip(data_df_2.clusters, data_df_2.centers)):
    for point_idx in cluster:
        df2['labels'].iloc[point_idx] = cluster_idx
        
df2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['labels'].iloc[point_idx] = cluster_idx
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['labels'].iloc[point_idx] = cluster_idx
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['labels'].iloc[point_idx] = cluster_idx
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['labels'].iloc[point_idx] = clus

Unnamed: 0,latitude,longitude,datetime,labels
0,39.999840,116.327191,2009-01-20 00:28:00,-1
1,39.999991,116.326629,2009-01-20 00:29:00,-1
2,40.001186,116.325686,2009-01-20 00:30:00,-1
3,40.002164,116.324517,2009-01-20 00:31:00,-1
4,40.003486,116.323199,2009-01-20 00:32:00,-1
...,...,...,...,...
939,32.057004,121.270890,2009-01-21 02:11:00,-1
940,32.057173,121.269710,2009-01-21 02:16:00,-1
941,32.056679,121.271410,2009-01-21 02:17:00,-1
942,32.056826,121.271292,2009-01-21 02:18:00,-1


In [20]:
df3['labels'] = -1
for cluster_idx, (cluster, center) in enumerate(zip(data_df_3.clusters, data_df_3.centers)):
    for point_idx in cluster:
        df3['labels'].iloc[point_idx] = cluster_idx
        
df3

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3['labels'].iloc[point_idx] = cluster_idx
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3['labels'].iloc[point_idx] = cluster_idx
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3['labels'].iloc[point_idx] = cluster_idx
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3['labels'].iloc[point_idx] = clus

Unnamed: 0,latitude,longitude,datetime,labels
0,39.999168,116.334129,2009-02-21 03:48:00,-1
1,39.999380,116.334245,2009-02-21 03:49:00,-1
2,40.000569,116.333664,2009-02-21 03:50:00,-1
3,40.000496,116.333412,2009-02-21 03:51:00,-1
4,40.000529,116.332941,2009-02-21 03:52:00,-1
...,...,...,...,...
711,39.996333,116.338616,2009-02-21 22:52:00,-1
712,39.996465,116.338802,2009-02-21 22:53:00,-1
713,39.996531,116.338755,2009-02-21 22:54:00,-1
714,39.997935,116.338913,2009-02-21 22:55:00,-1


In [21]:
df4['labels'] = -1
for cluster_idx, (cluster, center) in enumerate(zip(data_df_4.clusters, data_df_4.centers)):
    for point_idx in cluster:
        df4['labels'].iloc[point_idx] = cluster_idx
        
df4

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df4['labels'].iloc[point_idx] = cluster_idx
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df4['labels'].iloc[point_idx] = cluster_idx
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df4['labels'].iloc[point_idx] = cluster_idx
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df4['labels'].iloc[point_idx] = clus

Unnamed: 0,latitude,longitude,datetime,labels
0,39.998569,116.339200,2009-02-14 04:52:00,16
1,39.998634,116.338622,2009-02-14 04:53:00,8
2,39.998102,116.337258,2009-02-14 04:54:00,-1
3,39.997688,116.334919,2009-02-14 04:55:00,-1
4,39.997469,116.333972,2009-02-14 04:56:00,-1
...,...,...,...,...
755,39.999533,116.338270,2009-02-14 23:35:00,25
756,39.998664,116.338281,2009-02-14 23:36:00,11
757,39.998998,116.337676,2009-02-14 23:37:00,-1
758,39.999212,116.338239,2009-02-14 23:38:00,22


In [22]:
df5['labels'] = -1
for cluster_idx, (cluster, center) in enumerate(zip(data_df_5.clusters, data_df_5.centers)):
    for point_idx in cluster:
        df5['labels'].iloc[point_idx] = cluster_idx
        
df5

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df5['labels'].iloc[point_idx] = cluster_idx
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df5['labels'].iloc[point_idx] = cluster_idx
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df5['labels'].iloc[point_idx] = cluster_idx
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df5['labels'].iloc[point_idx] = clus

Unnamed: 0,latitude,longitude,datetime,labels
0,39.999869,116.326993,2008-12-02 16:00:00,14
1,39.999895,116.327039,2008-12-02 16:01:00,14
2,39.999880,116.325793,2008-12-02 16:02:00,14
3,40.000132,116.324713,2008-12-02 16:03:00,-1
4,40.001176,116.324511,2008-12-02 16:04:00,-1
...,...,...,...,...
680,39.996573,116.326611,2008-12-03 11:30:00,-1
681,39.998787,116.326686,2008-12-03 11:31:00,14
682,39.999864,116.327365,2008-12-03 11:32:00,14
683,39.999924,116.327458,2008-12-03 11:33:00,14


In [23]:
for i in range(-1, max(df1['labels']) + 1):
    print(len(df1[df1['labels'] == i]))


334
20
31
28
42
22
26
24
39
11
34
22
14
25
18
25
12
13
25
26
22
21
19
13
41
12
27
12
13
14
10
13
34
25
15
10
11
13
20
15
10
10


<h2>클러스터링 2번<h2>

In [None]:
import folium

# 예제 데이터 생성 (서울의 위도, 경도)
locations = data_df_1.centers.values.tolist()

# 지도의 중심 좌표 설정
map_center = [sum(x[0] for x in locations) / len(locations), sum(x[1] for x in locations) / len(locations)]

# Folium 맵 생성
mymap = folium.Map(location=map_center, zoom_start=12)

# 위치 데이터를 지도에 마커로 표시
for loc in locations:
    folium.Marker(location=loc, popup='Location').add_to(mymap)

# 지도를 HTML 파일로 저장하거나 주피터 노트북에서 바로 표시
mymap


In [None]:
# 예제 데이터 생성 (서울의 위도, 경도)
locations = data_df_2.centers.values.tolist()

# 지도의 중심 좌표 설정
map_center = [sum(x[0] for x in locations) / len(locations), sum(x[1] for x in locations) / len(locations)]

# Folium 맵 생성
mymap = folium.Map(location=map_center, zoom_start=12)

# 위치 데이터를 지도에 마커로 표시
for loc in locations:
    folium.Marker(location=loc, popup='Location').add_to(mymap)

# 지도를 HTML 파일로 저장하거나 주피터 노트북에서 바로 표시
mymap

In [None]:
# 예제 데이터 생성 (서울의 위도, 경도)
locations = data_df_3.centers.values.tolist()

# 지도의 중심 좌표 설정
map_center = [sum(x[0] for x in locations) / len(locations), sum(x[1] for x in locations) / len(locations)]

# Folium 맵 생성
mymap = folium.Map(location=map_center, zoom_start=12)

# 위치 데이터를 지도에 마커로 표시
for loc in locations:
    folium.Marker(location=loc, popup='Location').add_to(mymap)

# 지도를 HTML 파일로 저장하거나 주피터 노트북에서 바로 표시
mymap

In [None]:
# 예제 데이터 생성 (서울의 위도, 경도)
locations = data_df_4.centers.values.tolist()

# 지도의 중심 좌표 설정
map_center = [sum(x[0] for x in locations) / len(locations), sum(x[1] for x in locations) / len(locations)]

# Folium 맵 생성
mymap = folium.Map(location=map_center, zoom_start=12)

# 위치 데이터를 지도에 마커로 표시
for loc in locations:
    folium.Marker(location=loc, popup='Location').add_to(mymap)

# 지도를 HTML 파일로 저장하거나 주피터 노트북에서 바로 표시
mymap

In [None]:
# 예제 데이터 생성 (서울의 위도, 경도)
locations = data_df_5.centers.values.tolist()

# 지도의 중심 좌표 설정
map_center = [sum(x[0] for x in locations) / len(locations), sum(x[1] for x in locations) / len(locations)]

# Folium 맵 생성
mymap = folium.Map(location=map_center, zoom_start=12)

# 위치 데이터를 지도에 마커로 표시
for loc in locations:
    folium.Marker(location=loc, popup='Location').add_to(mymap)

# 지도를 HTML 파일로 저장하거나 주피터 노트북에서 바로 표시
mymap

<h2>실루엣 계수<h2>

In [None]:
df.iloc[1038]

In [None]:
silhouette_avg = silhouette_score(df1[['latitude', 'longitude']], df1['labels'])
print(f'Silhouette Score: {silhouette_avg}')

silhouette_avg = silhouette_score(df2[['latitude', 'longitude']], df2['labels'])
print(f'Silhouette Score: {silhouette_avg}')

silhouette_avg = silhouette_score(df3[['latitude', 'longitude']], df3['labels'])
print(f'Silhouette Score: {silhouette_avg}')

silhouette_avg = silhouette_score(df4[['latitude', 'longitude']], df4['labels'])
print(f'Silhouette Score: {silhouette_avg}')

silhouette_avg = silhouette_score(df5[['latitude', 'longitude']], df5['labels'])
print(f'Silhouette Score: {silhouette_avg}')

<h2>T-SNE 시각화<h2>

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns

# 데이터 X와 레이블 y를 사용한다고 가정
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(df1[['latitude', 'longitude']])

sns.set_palette("deep")
sns.scatterplot(x = X_tsne[:, 0], y = X_tsne[:, 1], hue=df1['labels'])
plt.legend(title='Label')
plt.show()


In [None]:
# 데이터 X와 레이블 y를 사용한다고 가정
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(df2[['latitude', 'longitude']])

sns.set_palette("deep")
sns.scatterplot(x = X_tsne[:, 0], y = X_tsne[:, 1], hue=df2['labels'])
plt.legend(title='Label')
plt.show()

In [None]:
# 데이터 X와 레이블 y를 사용한다고 가정
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(df3[['latitude', 'longitude']])

sns.set_palette("deep")
sns.scatterplot(x = X_tsne[:, 0], y = X_tsne[:, 1], hue=df3['labels'])
plt.legend(title='Label')
plt.show()

In [None]:
# 데이터 X와 레이블 y를 사용한다고 가정
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(df4[['latitude', 'longitude']])

sns.set_palette("deep")
sns.scatterplot(x = X_tsne[:, 0], y = X_tsne[:, 1], hue=df4['labels'])
plt.legend(title='Label')
plt.show()

In [None]:
# 데이터 X와 레이블 y를 사용한다고 가정
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(df5[['latitude', 'longitude']])

sns.set_palette("deep")
sns.scatterplot(x = X_tsne[:, 0], y = X_tsne[:, 1], hue=df5['labels'])
plt.legend(title='Label')
plt.show()