<a href="https://colab.research.google.com/github/yuancx2025/526_project_group2/blob/main/Data_science.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Load the ticket-level data

In [None]:
!pip install pyreadstat

Collecting pyreadstat
  Downloading pyreadstat-1.3.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.2 kB)
Downloading pyreadstat-1.3.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (666 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m666.4/666.4 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyreadstat
Successfully installed pyreadstat-1.3.1


In [None]:
import pandas as pd
from google.colab import drive

# Mount Google Drive so the Stata file stored there is readable from the Colab VM.
drive.mount('/content/drive')
file_path = "/content/drive/MyDrive/KG_cost.dta"

# Load the ticket-level table into `data`. The cells below use `data`, but the
# notebook previously never defined it, so a fresh "Run All" failed with NameError.
# NOTE(review): assumes `data` is exactly the contents of KG_cost.dta — confirm.
data = pd.read_stata(file_path)
data.head()

MessageError: Error: credential propagation was unsuccessful

Load airport-level data (coordinates used to visualize the route graph)

In [None]:
import pandas as pd

# OpenFlights airports.dat has no header row and (per its data documentation)
# writes missing fields as the literal string "\N". Declaring that token as NA
# is what makes the IATA filter below actually remove airports without a code.
airports_raw = pd.read_csv(
    "https://raw.githubusercontent.com/jpatokal/openflights/master/data/airports.dat",
    header=None,
    na_values=["\\N"],
)

# Positional column -> descriptive name for the fields used in this notebook.
airport_columns = {
    1: "airport_name",
    2: "city",
    3: "country",
    4: "IATA",
    5: "ICAO",
    6: "lat",
    7: "lon",
    8: "alt",
    9: "timezone",
}

airports = (
    airports_raw[list(airport_columns)]
    .rename(columns=airport_columns)
    # Keep only airports that can be matched on an IATA code.
    .dropna(subset=["IATA"])
    # One row per code so the left merges further down cannot duplicate tickets.
    .drop_duplicates(subset="IATA")
)
airports.head()

Combine the two datasets.

In [None]:

# Merge origin coordinates.
# The airport columns are renamed *before* merging (including the IATA key) so the
# join never leaves a stray "IATA" column behind that would collide in later merges.
origin_columns = {
    "IATA": "origin_IATA",
    "lat": "origin_lat",
    "lon": "origin_lon",
    "country": "origin_country",
    "city": "origin_city",
}
origin_airports = airports[list(origin_columns)].rename(columns=origin_columns)

data = (
    data
    # Drop results of a previous run so re-executing this cell is idempotent
    # instead of producing origin_lat_x / origin_lat_y duplicates.
    .drop(columns=list(origin_columns.values()), errors="ignore")
    .merge(origin_airports, how="left", left_on="origin", right_on="origin_IATA")
    # The key duplicates `origin`; it is only needed for the join itself.
    .drop(columns="origin_IATA")
)

In [None]:
# Merge destination coordinates.
# The airport columns are renamed *before* merging (including the IATA key) so the
# join cannot collide with any "IATA" column already present in `data`.
dest_columns = {
    "IATA": "dest_IATA",
    "lat": "dest_lat",
    "lon": "dest_lon",
    "country": "dest_country",
    "city": "dest_city",
}
dest_airports = airports[list(dest_columns)].rename(columns=dest_columns)

data = (
    data
    # Drop results of a previous run so re-executing this cell is idempotent
    # instead of producing dest_lat_x / dest_lat_y duplicates.
    .drop(columns=list(dest_columns.values()), errors="ignore")
    .merge(dest_airports, how="left", left_on="dest", right_on="dest_IATA")
    # The key duplicates `dest`; it is only needed for the join itself.
    .drop(columns="dest_IATA")
)

Check the merge match rate and restrict the data to the US airline market

In [None]:
# Share of rows whose origin / destination code found a match in the airport table.
# Printed explicitly: a bare expression is only displayed when it is the last line
# of a cell, so the original checks produced no output at all.
origin_match_rate = data["origin_lat"].notna().mean()
dest_match_rate = data["dest_lat"].notna().mean()
print(f"origin matched: {origin_match_rate:.1%} | dest matched: {dest_match_rate:.1%}")

# Keep only routes where both endpoints are in the United States.
data_us = data[(data["origin_country"] == "United States") &
               (data["dest_country"] == "United States")]

# Last expression of the cell -> rendered as a table for a visual sanity check.
data[["origin", "origin_lat", "origin_lon", "dest", "dest_lat", "dest_lon"]].head()

In [None]:
# Collapse the 2005 US rows to one row per directed (origin, dest) pair:
# coordinates take the first value in each group, HHI_route is averaged.
is_2005 = data_us["year"].eq(2005)
coordinate_cols = ["origin_lat", "origin_lon", "dest_lat", "dest_lon"]
route_aggregations = {col: "first" for col in coordinate_cols}
route_aggregations["HHI_route"] = "mean"

routes_2005 = (
    data_us.loc[is_2005]
    .groupby(["origin", "dest"], as_index=False)
    .agg(route_aggregations)
)

# Bucket routes by concentration. pd.cut intervals are right-closed by default:
# (-inf, 0.2] -> competitive, (0.2, 0.6] -> oligopoly, (0.6, inf) -> monopoly.
# NOTE(review): the cut-offs presume HHI_route is on a 0-1 scale — confirm.
hhi_edges = [-float("inf"), 0.2, 0.6, float("inf")]
market_labels = ["competitive", "oligopoly", "monopoly"]
routes_2005["market_type"] = pd.cut(
    routes_2005["HHI_route"],
    bins=hhi_edges,
    labels=market_labels,
)

Graph the routes by market type

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Bar chart of how many 2005 routes fall into each market-type bucket.
market_order = ["competitive", "oligopoly", "monopoly"]
market_palette = {"competitive": "#1b9e77",
                  "oligopoly": "#d95f02",
                  "monopoly": "#7570b3"}

# seaborn >= 0.13 deprecates `palette` without `hue`; mapping hue to the same
# variable keeps the per-bar colours, and legend=False avoids a redundant legend.
ax = sns.countplot(data=routes_2005,
                   x="market_type",
                   hue="market_type",
                   order=market_order,
                   hue_order=market_order,
                   palette=market_palette,
                   legend=False)
ax.set_title("Number of Routes by Market Type (2005)")
ax.set_xlabel("Market Type")
ax.set_ylabel("Number of Routes")
plt.show()

In [None]:
import geopandas as gpd
import matplotlib.pyplot as plt
import geodatasets

from matplotlib.lines import Line2D
from shapely.geometry import LineString

# --- US basemap ---
# Land polygons intersecting the contiguous-US bounding box (lon -130..-65, lat 24..50).
world = gpd.read_file(geodatasets.get_path("naturalearth.land"))
usa = world.cx[-130:-65, 24:50]

# --- choose a small sample to reduce clutter ---
# Fixed random_state keeps the sampled routes identical across re-runs.
coord_cols = ["origin_lon", "origin_lat", "dest_lon", "dest_lat"]
routes_sample = (
    routes_2005
    .sample(frac=0.15, random_state=42)
    .dropna(subset=coord_cols)
)

# --- build one straight origin -> destination segment per route ---
route_lines = [
    LineString([(o_lon, o_lat), (d_lon, d_lat)])
    for o_lon, o_lat, d_lon, d_lat in zip(*(routes_sample[c] for c in coord_cols))
]
routes_gdf = gpd.GeoDataFrame(routes_sample, geometry=route_lines, crs="EPSG:4326")
# Discard geometries shapely reports as invalid before plotting.
routes_gdf = routes_gdf[routes_gdf.geometry.is_valid]

color_map = {
    "competitive": "#1b9e77",
    "oligopoly": "#d95f02",
    "monopoly": "#7570b3"
}

fig, ax = plt.subplots(figsize=(10, 6))
usa.plot(ax=ax, color="lightgray", edgecolor="white")

# --- plot each market type if it exists ---
legend_handles = []
for label, color in color_map.items():
    subset = routes_gdf[routes_gdf["market_type"] == label]
    if subset.empty:
        # Plotting an empty GeoDataFrame only emits a warning; skip it.
        continue
    subset.plot(ax=ax, color=color, linewidth=0.4, alpha=0.4)
    # Proxy handle: the route lines are thin and translucent, so a legend entry
    # taken from them would be barely visible.
    legend_handles.append(Line2D([], [], color=color, linewidth=2, label=label))

# --- decorations ---
if legend_handles:
    ax.legend(handles=legend_handles, title="Market Type",
              loc="lower left", frameon=True)
ax.set_title("U.S. Airline Routes by Market Type (2005)", fontsize=14, weight="bold")
ax.set_xlim(-130, -65)
ax.set_ylim(24, 50)
ax.set_axis_off()
fig.tight_layout()
plt.show()