# импорты

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gpxpy
from sqlalchemy import create_engine
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from scipy.stats import shapiro, normaltest
from math import radians, cos, sin, asin, sqrt
import warnings
warnings.filterwarnings('ignore')

# Parse the GPX file into a DataFrame of track points

In [None]:
# Read the GPX file and flatten every track point into one table.
gpx_file = 'track.gpx'
# GPX is XML and is normally UTF-8 encoded. Passing the encoding explicitly
# keeps the platform default (e.g. a legacy Windows code page) from
# rejecting or corrupting non-ASCII text such as track names.
with open(gpx_file, 'r', encoding='utf-8') as f:
    gpx = gpxpy.parse(f)

# One [latitude, longitude, elevation] row per point, in file order,
# across all tracks and all of their segments.
data = []
for track in gpx.tracks:
    for segment in track.segments:
        for point in segment.points:
            data.append([point.latitude, point.longitude, point.elevation])

df_points = pd.DataFrame(data, columns=['latitude', 'longitude', 'elevation'])


# расчет метрик трека

In [None]:


# Summary metrics for the parsed track, stored as a one-row DataFrame.

# Haversine length (metres) of every leg between consecutive rows of
# df_points, vectorised with NumPy instead of a per-row ``iloc`` loop.
# NOTE(review): if df_points spans several tracks/segments, the jump between
# the end of one and the start of the next is counted as a leg - confirm
# this is acceptable for the input files.
lat_rad = np.radians(df_points['latitude'].to_numpy(dtype=float))
lon_rad = np.radians(df_points['longitude'].to_numpy(dtype=float))
dlat = np.diff(lat_rad)
dlon = np.diff(lon_rad)
a = (np.sin(dlat / 2) ** 2
     + np.cos(lat_rad[:-1]) * np.cos(lat_rad[1:]) * np.sin(dlon / 2) ** 2)
# Floating-point rounding can push ``a`` marginally outside [0, 1]; clipping
# keeps sqrt/arcsin defined instead of producing NaN.
a = np.clip(a, 0.0, 1.0)
# 6371 km Earth radius, converted to metres by the trailing factor of 1000.
distances = (6371 * 2 * np.arcsin(np.sqrt(a)) * 1000).tolist()

total_distance_m = float(sum(distances))

# Elevation may be missing for some points; work on the numeric values only.
elevation = pd.to_numeric(df_points['elevation'], errors='coerce').dropna()
# Elevation gain is the cumulative ascent (sum of the positive steps between
# consecutive points). The max-min range used previously undercounts any
# track that climbs more than once.
# NOTE(review): raw GPS elevation is noisy; consider smoothing if the
# resulting gain looks inflated.
elevation_gain = float(elevation.diff().clip(lower=0).sum())

# A track without a <name> yields None, so fall back for that case as well
# as for a file with no tracks at all.
track_name = (gpx.tracks[0].name if gpx.tracks else None) or 'Unnamed'

metrics = pd.DataFrame([{
    'track_name': track_name,
    'total_distance_km': total_distance_m / 1000,
    'total_distance_m': total_distance_m,
    'elevation_gain': elevation_gain,
    'avg_elevation': elevation.mean(),
    'points_count': len(df_points),
}])


# db


In [None]:
import os

# Persist the track summary and its points to the database.
# The connection URL (which embeds credentials) can be supplied through the
# TRACKS_DB_URL environment variable so that real credentials do not have to
# live in the notebook; the original local URL remains the default.
DEFAULT_DB_URL = 'postgresql://user:password@localhost:5432/tracks_db'
engine = create_engine(os.environ.get('TRACKS_DB_URL', DEFAULT_DB_URL))

# if_exists='replace' drops and recreates each table on every run.
metrics.to_sql('tracks', engine, if_exists='replace', index=False)
df_points.to_sql('track_points', engine, if_exists='replace', index=False)

# Extra hard-coded tracks, used as example data alongside the parsed one.
additional_tracks = pd.DataFrame({
    'track_name': ['Hike A', 'Hike B', 'Hike C'],
    'total_distance_km': [15.2, 8.5, 22.1],
    'total_distance_m': [15200, 8500, 22100],
    'elevation_gain': [1200, 350, 1800],
    'avg_elevation': [800, 200, 1500],
    'points_count': [500, 300, 700]
})

# Parsed track first, then the example tracks, with a fresh 0..n-1 index.
all_metrics = pd.concat([metrics, additional_tracks], ignore_index=True)

#  АНАЛИЗ РАСПРЕДЕЛЕНИЙ ПРИЗНАКОВ

In [None]:
# Normality check for each clustering feature.
features = ['total_distance_km', 'elevation_gain', 'avg_elevation']
normality_results = {}

# Smallest sample each test accepts: Shapiro-Wilk needs at least 3
# observations, and D'Agostino's test (normaltest) relies on skewtest, which
# needs at least 8. Below these sizes SciPy either raises ValueError or
# returns NaN depending on its version, so the test is skipped and NaN is
# recorded explicitly.
SHAPIRO_MIN_N = 3
DAGOSTINO_MIN_N = 8

for feature in features:
    if feature not in all_metrics.columns:
        continue

    sample = all_metrics[feature].dropna()
    n_obs = len(sample)

    shapiro_p = shapiro(sample)[1] if n_obs >= SHAPIRO_MIN_N else float('nan')
    dagostino_p = (
        normaltest(sample)[1] if n_obs >= DAGOSTINO_MIN_N else float('nan')
    )

    normality_results[feature] = {
        'shapiro_p': shapiro_p,
        'dagostino_p': dagostino_p,
        # A NaN p-value compares False, so an untestable feature is not
        # reported as normal.
        'is_normal': bool(shapiro_p > 0.05),
    }


#  ПОДГОТОВКА ДАННЫХ ДЛЯ КЛАСТЕРИЗАЦИИ

In [None]:
# Build the clustering input: the selected feature columns with missing
# values replaced by 0, then standardised column by column.
feature_frame = all_metrics[features].fillna(0)
X = feature_frame.values

scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

# DETERMINING THE OPTIMAL NUMBER OF CLUSTERS

In [None]:
# Choose the cluster count with the best silhouette score.
# Candidates run from 2 up to min(9, n_samples - 1); the range is built once
# and reused for both the search and the final lookup.
candidate_ks = range(2, min(10, len(X)))
if not candidate_ks:
    # With fewer than 3 samples there is no candidate to score; fail with a
    # clear message instead of an opaque argmax-on-empty-sequence error.
    raise ValueError(
        f'Need at least 3 samples to select a cluster count, got {len(X)}'
    )

silhouette_scores = []
for k in candidate_ks:
    # n_init is set explicitly: the library default differs between
    # scikit-learn versions, and the warning about it is silenced by the
    # global warnings filter at the top of the file.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42).fit(X_scaled)
    silhouette_scores.append(silhouette_score(X_scaled, kmeans.labels_))

optimal_k = candidate_ks[int(np.argmax(silhouette_scores))]


# КЛАСТЕРИЗАЦИЯ K-MEANS

In [None]:
# Fit the final K-means model with the selected cluster count and attach
# each track's cluster id to the metrics table.
# n_init is set explicitly: the library default differs between scikit-learn
# versions, and the warning about it is silenced by the global warnings
# filter at the top of the file.
kmeans = KMeans(n_clusters=optimal_k, n_init=10, random_state=42)
cluster_labels = kmeans.fit_predict(X_scaled)
all_metrics['cluster'] = cluster_labels

# ОЦЕНКА

In [None]:
# Score the final clustering with three indices, all computed on the
# standardised features and the assigned labels.
silhouette, calinski, davies = (
    metric(X_scaled, cluster_labels)
    for metric in (
        silhouette_score,
        calinski_harabasz_score,
        davies_bouldin_score,
    )
)

# VISUAL

In [None]:
# 2x2 summary figure: distance histogram, distance-vs-gain scatter coloured
# by cluster, per-cluster distance boxplot, and a text panel with the
# clustering quality scores.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
(ax_hist, ax_scatter), (ax_box, ax_text) = axes

# Histogram of track distance
ax_hist.hist(all_metrics['total_distance_km'], bins=10, alpha=0.7)
ax_hist.set_xlabel('Дистанция (км)')
ax_hist.set_ylabel('Частота')

# Scatter plot: distance against elevation gain, coloured by cluster id
scatter = ax_scatter.scatter(all_metrics['total_distance_km'],
                             all_metrics['elevation_gain'],
                             c=cluster_labels, cmap='viridis')
ax_scatter.set_xlabel('Дистанция (км)')
ax_scatter.set_ylabel('Набор высоты (м)')
plt.colorbar(scatter, ax=ax_scatter)

# Boxplot of distance for each cluster
cluster_ids = list(range(optimal_k))
box_data = [all_metrics[all_metrics['cluster'] == i]['total_distance_km']
            for i in cluster_ids]
ax_box.boxplot(box_data)
# boxplot numbers its boxes 1..k by default; relabel the ticks with the
# actual cluster ids (0..k-1) so they agree with the scatter colour bar.
ax_box.set_xticklabels([str(i) for i in cluster_ids])
ax_box.set_xlabel('Кластер')
ax_box.set_ylabel('Дистанция (км)')

# Quality scores shown as centred text in an otherwise empty panel
metrics_text = f'Silhouette: {silhouette:.3f}\nCalinski: {calinski:.1f}\nDavies: {davies:.3f}'
ax_text.text(0.5, 0.5, metrics_text, ha='center', va='center', fontsize=12)
ax_text.axis('off')

plt.tight_layout()
plt.show()

# СОХРАНЕНИЕ В DB

In [None]:
# Write the metrics table, including the cluster column, to the
# 'track_clusters' table; an existing table of that name is replaced.
all_metrics.to_sql(
    name='track_clusters',
    con=engine,
    if_exists='replace',
    index=False,
)

# Build an interactive map of the track

In [None]:
import folium

# Draw the track as a polyline on a map centred on the mean coordinate and
# save it as an HTML file. Nothing is written when there are no points.
if not df_points.empty:
    center_lat = df_points['latitude'].mean()
    center_lon = df_points['longitude'].mean()
    m = folium.Map(location=[center_lat, center_lon], zoom_start=12)

    # (latitude, longitude) pairs in row order
    points = list(
        df_points[['latitude', 'longitude']].itertuples(index=False, name=None)
    )
    track_line = folium.PolyLine(points, color='blue', weight=2.5, opacity=1)
    track_line.add_to(m)

    m.save('track_map.html')