In [30]:
import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Load the flights table; the first CSV column becomes the row index.
data = pd.read_csv("flights.csv", index_col=0)

# Columns used as clustering features.
feature_columns = ['duration', 'distance', 'price']
X = data[feature_columns]

# Fit k-means with a fixed seed so the cluster labels are reproducible.
# NOTE(review): the features are passed to KMeans as-is (no scaling step), so
# the column with the largest numeric spread will weigh most in the Euclidean
# distance -- confirm this is intended.
number_of_clusters = 2
kmeans = KMeans(n_clusters=number_of_clusters, n_init='auto', random_state=0)
kmeans.fit(X)

# Attach each row's cluster label to the original dataset.
data['cluster'] = kmeans.labels_

# Price-inconsistency report: for every cluster, print the value ranges of the
# clustering features and list the rows whose price falls outside the
# 1.5 * IQR fences computed from that cluster's own price distribution.
for i in range(number_of_clusters):
    cluster = data[data['cluster'] == i]
    price = cluster['price']

    number_of_values = cluster.shape[0]
    median = price.median()
    Q1 = price.quantile(0.25)
    Q3 = price.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Strict comparisons: a price exactly on a fence is not an outlier, and a
    # NaN price compares False on both sides, so it is never flagged.
    outlier_prices = cluster[(price < lower_bound) | (price > upper_bound)]

    print(f"Cluster {i}")
    for column in ('price', 'duration', 'distance'):
        print(f"\t{column} range: {cluster[column].min()} - {cluster[column].max()}")
    print(f"\tnumber of values: {number_of_values}, Q1 = {Q1:.2f}, median = {median}, Q3 = {Q3:.2f}")
    print(f"\tLower bound = {lower_bound:.2f}, Upper bound = {upper_bound:.2f}")
    # The index labels identify the offending rows in the original dataset.
    print(f"\tOutlier prices: {outlier_prices.shape[0]}: {list(outlier_prices.index)}\n")
    


Cluster 0
	price range: 13 - 453
	duration range: 120 - 700
	distance range: 1810 - 8787
	number of values: 2220, Q1 = 41.00, median = 62.0, Q3 = 94.00
	Lower bound = -38.50, Upper bound = 173.50
	Outlier prices: 88: [47, 62, 255, 307, 308, 346, 409, 430, 457, 692, 738, 794, 875, 1293, 1492, 1502, 1525, 1815, 1895, 1899, 1952, 2192, 2193, 2194, 2529, 2600, 2685, 2732, 2851, 2853, 2885, 2929, 2980, 3060, 3098, 3126, 3134, 3162, 3438, 3601, 3794, 3796, 3840, 3905, 4084, 4124, 4192, 4272, 4577, 4631, 4670, 4833, 4866, 4897, 4917, 4942, 5058, 5164, 5215, 5295, 5304, 5312, 5461, 5497, 5642, 5672, 5688, 5770, 5796, 5847, 5959, 6005, 6117, 6174, 6207, 6219, 6273, 6377, 6665, 6703, 6792, 6826, 6871, 6887, 6943, 7030, 7067, 7193]

Cluster 1
	price range: 6 - 260
	duration range: 30 - 205
	distance range: 61 - 1810
	number of values: 5048, Q1 = 30.00, median = 45.0, Q3 = 69.00
	Lower bound = -28.50, Upper bound = 127.50
	Outlier prices: 84: [100, 298, 498, 674, 729, 820, 872, 1162, 1176, 1367, 1