In [None]:
from pyspark.sql.functions import col, sum, mean, stddev, min, max, count, dayofweek, hour
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Use Spark SQL to query the Gold lakehouse (it must be the notebook's default
# lakehouse or live in the same workspace). The four queries run in the order
# listed and bind, respectively, the trip fact table and the weekday,
# payment-type and location dimension tables.
df, dim_weekday, dim_paymenttype, dim_location = (
    spark.sql(query)
    for query in (
        "SELECT * FROM Gold_LH_NYC.fact_trip",
        "SELECT * FROM Gold_LH_NYC.dim_weekday",
        "SELECT * FROM Gold_LH_NYC.dim_paymenttype",
        "SELECT * FROM Gold_LH_NYC.dim_location",
    )
)

In [None]:
# Total passengers per (pickup_year, pickup_month), aggregated in Spark.
# NOTE: `sum` here is pyspark.sql.functions.sum (imported at the top of the
# notebook), not the Python builtin.
monthly_passenger_count = (
    df.groupBy("pickup_year", "pickup_month")
    .agg(sum("passenger_count").alias("total_passenger_count"))
    .orderBy("pickup_year", "pickup_month")
)

# Collect the aggregate to pandas, add a "year-month" label column (month
# zero-padded to two digits) and sort the rows by that label.
monthly_passenger_count_pd = (
    monthly_passenger_count.toPandas()
    .assign(
        year_month=lambda months: (
            months['pickup_year'].astype(str)
            + '-'
            + months['pickup_month'].astype(str).str.zfill(2)
        )
    )
    .sort_values('year_month')
)

# Show the result as a table.
display(monthly_passenger_count_pd)

In [None]:
# Line chart of the monthly passenger totals (one marker per year-month).
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(
    monthly_passenger_count_pd['year_month'],
    monthly_passenger_count_pd['total_passenger_count'],
    marker='o',
)
# Rotate the year-month labels so they do not overlap.
ax.tick_params(axis='x', labelrotation=90)
ax.set_xlabel('Year-Month')
ax.set_ylabel('Total Passenger Count')
ax.set_title('Monthly Passenger Count')
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Total passengers per pickup hour, aggregated in Spark and ordered by hour.
hourly_passenger_count = (
    df.groupBy("pickup_hour")
    .agg(sum("passenger_count").alias("total_passenger_count"))
    .orderBy("pickup_hour")
)

# Collect the aggregate to pandas for plotting.
hourly_passenger_count_pd = hourly_passenger_count.toPandas()

# One rainbow colour per bar: each hour is scaled by the largest hour present
# so the colormap input lies in [0, 1].
hours = hourly_passenger_count_pd['pickup_hour']
bar_colors = plt.cm.rainbow(hours / hours.max())

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(hours, hourly_passenger_count_pd['total_passenger_count'], color=bar_colors)
# Put a tick on every hour that has data.
ax.set_xticks(hours)
ax.set_xlabel('Hour of Day')
ax.set_ylabel('Total Passenger Count')
ax.set_title('Hourly Passenger Count')
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Total passengers per pickup weekday, enriched with the weekday dimension
# table (joined on pickup_weekday).
#
# The sort is applied AFTER the join: a Spark join does not guarantee that the
# row order of its inputs is preserved, so ordering before the join (as this
# cell used to do) could still display the weekdays in arbitrary order.
weekday_passenger_count = (
    df.groupBy("pickup_weekday")
    .agg(sum("passenger_count").alias("total_passenger_count"))
    .join(dim_weekday, on="pickup_weekday")
    .orderBy("pickup_weekday")
)

# Show the result as a table.
display(weekday_passenger_count)

In [None]:
# Collect the weekday aggregate to pandas and order the rows by weekday.
weekday_passenger_count_pd = weekday_passenger_count.toPandas().sort_values('pickup_weekday')

# Shade each bar by its passenger total: map the totals onto the viridis
# colormap, normalised between the smallest and largest total.
totals = weekday_passenger_count_pd['total_passenger_count']
norm = plt.Normalize(totals.min(), totals.max())
colors = plt.cm.viridis(norm(totals))

# Bar chart labelled with the weekday description from the dimension table.
fig, ax = plt.subplots(figsize=(12, 6))
bars = ax.bar(weekday_passenger_count_pd['description'], totals, color=colors)
ax.set_xlabel('Day of Week')
ax.set_ylabel('Total Passenger Count')
ax.set_title('Passenger Count by Day of Week')
ax.grid(True)

# Colorbar explaining the bar shading. The mappable is not attached to any
# artist, hence the empty array.
sm = plt.cm.ScalarMappable(cmap='viridis', norm=norm)
sm.set_array([])
fig.colorbar(sm, ax=ax, label='Passenger Count')

# Lay out the figure only once the colorbar exists so that it is included in
# the layout, then render explicitly (this also keeps the Colorbar repr out of
# the cell output).
fig.tight_layout()
plt.show()

In [None]:
# Summary statistics for total_amount, computed by Spark in a single pass.
# NOTE: `min` / `max` are the pyspark.sql.functions aggregates imported at the
# top of the notebook, not the Python builtins.
total_amount_stats = df.select(
    mean("total_amount").alias("mean"),
    stddev("total_amount").alias("stddev"),
    min("total_amount").alias("min"),
    max("total_amount").alias("max"),
    count("total_amount").alias("count")
).collect()

# Unpack the single aggregate row. Only the mean is drawn below; the other
# values remain available for inspection.
mean_value = total_amount_stats[0]['mean']
stddev_value = total_amount_stats[0]['stddev']
min_value = total_amount_stats[0]['min']
max_value = total_amount_stats[0]['max']
count_value = total_amount_stats[0]['count']

# Bin the column on the cluster instead of collecting every fact row to the
# driver with toPandas(). RDD.histogram(50) splits [min, max] into 50
# equal-width buckets (the last one closed), which matches what
# plt.hist(bins=50) would compute, and returns only the 51 bucket edges and
# the 50 counts. Nulls are dropped up front; the cast keeps the arithmetic in
# plain floats whatever numeric type the column has.
total_amount_edges, total_amount_counts = (
    df.where(col("total_amount").isNotNull())
    .select(col("total_amount").cast("double"))
    .rdd.map(lambda row: row[0])
    .histogram(50)
)

# Draw the pre-binned histogram: one bar per bucket, anchored at its left edge.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(
    total_amount_edges[:-1],
    total_amount_counts,
    width=[right - left for left, right in zip(total_amount_edges, total_amount_edges[1:])],
    align='edge',
    color='blue',
    edgecolor='black',
    alpha=0.7,
)

# Mark the mean with a dashed vertical line.
ax.axvline(mean_value, color='r', linestyle='dashed', linewidth=1, label=f'Mean: {mean_value:.2f}')

# Annotate the mean at 80% of the current y-axis upper limit.
ax.text(mean_value, ax.get_ylim()[1]*0.8, f'Mean: {mean_value:.2f}', color='r', ha='center', fontsize=16)

# Labels and title.
ax.set_xlabel('Total Amount')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Total Amount')
ax.legend()
ax.grid(True)
fig.tight_layout()

plt.show()

In [None]:
# Convert trip_distance from miles to meters (1 mile is about 1609.34 m).
df_m = df.withColumn("trip_distance_meters", col("trip_distance") * 1609.34)

# Mean distance in meters, computed by Spark.
trip_distance_stats = df_m.select(
    mean("trip_distance_meters").alias("mean")
).collect()

# Unpack the single aggregate row.
mean_value = trip_distance_stats[0]['mean']

# Bin the column on the cluster instead of collecting every fact row to the
# driver with toPandas(). RDD.histogram(50) splits [min, max] into 50
# equal-width buckets (the last one closed), which matches what
# plt.hist(bins=50) would compute, and returns only the 51 bucket edges and
# the 50 counts. Nulls are dropped up front.
distance_edges, distance_counts = (
    df_m.where(col("trip_distance_meters").isNotNull())
    .select(col("trip_distance_meters").cast("double"))
    .rdd.map(lambda row: row[0])
    .histogram(50)
)

# Draw the pre-binned histogram: one bar per bucket, anchored at its left edge.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(
    distance_edges[:-1],
    distance_counts,
    width=[right - left for left, right in zip(distance_edges, distance_edges[1:])],
    align='edge',
    color='silver',
    edgecolor='black',
    alpha=0.7,
)

# Mark the mean with a dashed vertical line.
ax.axvline(mean_value, color='r', linestyle='dashed', linewidth=1, label=f'Mean: {mean_value:.2f} meters')

# Annotate the mean at 90% of the current y-axis upper limit.
ax.text(mean_value, ax.get_ylim()[1]*0.9, f'Mean: {mean_value:.2f} meters', color='r', ha='center', fontsize=12)

# Labels and title.
ax.set_xlabel('Trip Distance (meters)', fontsize=14)
ax.set_ylabel('Frequency', fontsize=14)
ax.set_title('Distribution of Trip Distance (meters)', fontsize=16)
ax.legend(fontsize=12)
ax.grid(True)
fig.tight_layout()

plt.show()

In [None]:
# Mean trip duration, computed by Spark.
# NOTE(review): the plot labels below assume trip_duration is in minutes - confirm.
trip_duration_stats = df.select(
    mean("trip_duration").alias("mean")
).collect()

# Unpack the single aggregate row.
mean_duration = trip_duration_stats[0]['mean']

# Bin the column on the cluster instead of collecting every fact row to the
# driver with toPandas(). RDD.histogram(50) splits [min, max] into 50
# equal-width buckets (the last one closed), which matches what
# plt.hist(bins=50) would compute, and returns only the 51 bucket edges and
# the 50 counts. Nulls are dropped up front; the cast keeps the arithmetic in
# plain floats whatever numeric type the column has.
duration_edges, duration_counts = (
    df.where(col("trip_duration").isNotNull())
    .select(col("trip_duration").cast("double"))
    .rdd.map(lambda row: row[0])
    .histogram(50)
)

# Draw the pre-binned histogram: one bar per bucket, anchored at its left edge.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(
    duration_edges[:-1],
    duration_counts,
    width=[right - left for left, right in zip(duration_edges, duration_edges[1:])],
    align='edge',
    color='green',
    edgecolor='black',
    alpha=0.7,
)

# Mark the mean with a dashed vertical line.
ax.axvline(mean_duration, color='r', linestyle='dashed', linewidth=1, label=f'Mean: {mean_duration:.2f} minutes')

# Annotate the mean at 90% of the current y-axis upper limit.
ax.text(mean_duration, ax.get_ylim()[1]*0.9, f'Mean: {mean_duration:.2f} minutes', color='r', ha='center', fontsize=12)

# Labels and title.
ax.set_xlabel('Trip Duration (minutes)', fontsize=14)
ax.set_ylabel('Frequency', fontsize=14)
ax.set_title('Distribution of Trip Duration', fontsize=16)
ax.legend(fontsize=12)
ax.grid(True)
fig.tight_layout()

plt.show()

In [None]:
# Upper bound on the number of rows collected to the driver and drawn.
# Collecting and scattering every fact row is what this cell used to do; that
# risks exhausting driver memory and produces an overplotted, slow-to-render
# figure. Above the cap a seeded random sample is used instead; at or below
# it the full data set is plotted exactly as before.
MAX_SCATTER_POINTS = 100_000
SCATTER_SAMPLE_SEED = 42

# Select only the two plotted columns, then down-sample in Spark if needed.
trip_points = df.select("trip_duration", "total_amount")
n_trips = trip_points.count()
if n_trips > MAX_SCATTER_POINTS:
    # sample() keeps each row independently with the given probability, so
    # the result holds approximately (not exactly) MAX_SCATTER_POINTS rows.
    trip_points = trip_points.sample(
        withReplacement=False,
        fraction=MAX_SCATTER_POINTS / n_trips,
        seed=SCATTER_SAMPLE_SEED,
    )

# Convert the (possibly sampled) rows to a pandas DataFrame.
df_pd = trip_points.toPandas()

# Scatter plot of total amount against trip duration.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(df_pd['trip_duration'], df_pd['total_amount'], alpha=0.5, edgecolor='k')

# Labels and title.
ax.set_xlabel('Trip Duration (minutes)', fontsize=14)
ax.set_ylabel('Total Amount ($)', fontsize=14)
ax.set_title('Relationship between Trip Duration and Total Amount', fontsize=16)
ax.grid(True)
fig.tight_layout()

plt.show()

In [None]:
# Count trips per payment_type (non-null values only), attach the payment-type
# dimension table and order from most to least frequent.
payment_type_distribution = (
    df.groupBy("payment_type")
    .agg(count("payment_type").alias("count"))
    .join(dim_paymenttype, on="payment_type")
    .orderBy(col("count").desc())
)

# Collect the aggregate to pandas for plotting.
payment_type_pd = payment_type_distribution.toPandas()

# Bar chart with one bar per payment-type description.
fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(payment_type_pd['description'], payment_type_pd['count'], color='blue', edgecolor='black', alpha=0.7)

# Write each bar's count just above it, centred horizontally on the bar.
for bar in bars:
    yval = bar.get_height()
    label_x = bar.get_x() + bar.get_width() / 2
    ax.text(label_x, yval, int(yval), va='bottom', ha='center', fontsize=12, color='black')

# Labels and title.
ax.set_xlabel('Payment Type', fontsize=14)
ax.set_ylabel('Count', fontsize=14)
ax.set_title('Distribution of Payment Type', fontsize=16)
ax.grid(True)
fig.tight_layout()

plt.show()

In [None]:
# Count trips per passenger_count in Spark instead of collecting the whole
# column to the driver; only one row per distinct value is brought back.
#
# The plotted range stays 1..10 as before, but every value now gets its own
# bar: plt.hist(bins=range(1, 11)) has edges 1..10, so its closed last bin
# [9, 10] lumped 10-passenger trips into the bar drawn at 9.
passenger_count_distribution = (
    df.where(col("passenger_count").between(1, 10))
    .groupBy("passenger_count")
    .agg(count("passenger_count").alias("trips"))
    .orderBy("passenger_count")
)

# Convert the small aggregate to a pandas DataFrame.
passenger_count_pd = passenger_count_distribution.toPandas()

# Bar chart of the distribution; width=1.0 keeps the bars touching like a histogram.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(
    passenger_count_pd['passenger_count'],
    passenger_count_pd['trips'],
    width=1.0,
    color='blue',
    edgecolor='black',
    alpha=0.7,
)
# One tick per possible passenger count in the plotted range.
ax.set_xticks(range(1, 11))

# Labels and title.
ax.set_xlabel('Passenger Count', fontsize=14)
ax.set_ylabel('Frequency', fontsize=14)
ax.set_title('Distribution of Passenger Count', fontsize=16)
ax.grid(True)
fig.tight_layout()

plt.show()

In [None]:
def _top_locations(id_column, n=5):
    """Return the `n` most frequent values of `id_column` in `df`, with zone details.

    Args:
        id_column: Name of a location-ID column of `df`
            (e.g. "pickup_LocationID").
        n: Number of locations to keep.

    Returns:
        A Spark DataFrame with the columns `id_column`, "count", "Borough",
        "Zone" and "service_zone", ordered by descending count. IDs with no
        match in `dim_location` are dropped by the inner join, so fewer than
        `n` rows may be returned.
    """
    # Ties on count are broken by the ID so that limit(n) always selects the
    # same rows; ordering by count alone is nondeterministic when counts tie.
    ranking = [col("count").desc(), col(id_column).asc()]
    top_ids = (
        df.groupBy(id_column)
        .agg(count(id_column).alias("count"))
        .orderBy(*ranking)
        .limit(n)
    )
    # Re-apply the ordering after the join, which does not preserve row order.
    return (
        top_ids.join(dim_location, top_ids[id_column] == dim_location["LocationID"])
        .select(id_column, "count", "Borough", "Zone", "service_zone")
        .orderBy(*ranking)
    )


def _plot_top_locations(locations_pd, color, xlabel, title):
    """Draw a bar chart of trip counts per zone for a top-locations pandas frame.

    Args:
        locations_pd: pandas DataFrame with "Zone" and "count" columns.
        color: Fill colour of the bars.
        xlabel: Label of the x-axis.
        title: Title of the chart.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(locations_pd['Zone'], locations_pd['count'], color=color, edgecolor='black', alpha=0.7)
    ax.set_xlabel(xlabel, fontsize=14)
    ax.set_ylabel('Count', fontsize=14)
    ax.set_title(title, fontsize=16)
    # Zone names are long; rotate them so they do not overlap.
    ax.tick_params(axis='x', labelrotation=90)
    fig.tight_layout()
    plt.show()


# Top 5 pickup and dropoff locations, joined with the location dimension table.
pickup_location_distribution = _top_locations("pickup_LocationID")
dropoff_location_distribution = _top_locations("dropoff_LocationID")

# Convert the small result sets to pandas DataFrames.
pickup_location_pd = pickup_location_distribution.toPandas()
dropoff_location_pd = dropoff_location_distribution.toPandas()

# One bar chart per direction.
_plot_top_locations(pickup_location_pd, 'blue', 'Pickup Zone', 'Top 5 Pickup Locations')
_plot_top_locations(dropoff_location_pd, 'green', 'Dropoff Zone', 'Top 5 Dropoff Locations')