In [1]:
import os
print(os.popen("java -version 2>&1").read())

openjdk version "17.0.14" 2025-01-21
OpenJDK Runtime Environment Homebrew (build 17.0.14+0)
OpenJDK 64-Bit Server VM Homebrew (build 17.0.14+0, mixed mode, sharing)



In [2]:
spark.stop()

NameError: name 'spark' is not defined

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import LongType
import os

# 1️⃣ Spark 세션 생성
spark = SparkSession.builder.appName("Yellow_Taxi_Anlaysis").config("spark.driver.bindAddress", "127.0.0.1").getOrCreate()

# 2️⃣ 모든 .parquet 파일 경로 자동으로 불러오기
directory_path = "/Users/admin/Desktop/GitHub/softeer/과제/M4/NYC_TLC_Trip_Data/"
file_paths = [os.path.join(directory_path, f) for f in os.listdir(directory_path) if f.endswith(".parquet")]

# 3️⃣ 모든 파일에 대해 형변환 처리 및 병합
df_list = []

for file_path in file_paths:
    # 4️⃣ 각 파일 읽기
    df = spark.read.parquet(file_path)
    
    # 5️⃣ 형변환 (필요한 컬럼에 대해)
    df = df.withColumn("VendorID", col("VendorID").cast(LongType())) \
           .withColumn("PULocationID", col("PULocationID").cast(LongType())) \
           .withColumn("DOLocationID", col("DOLocationID").cast(LongType())) \
           .withColumn("passenger_count", col("passenger_count").cast(LongType())) \
           .withColumn("RatecodeID", col("RatecodeID").cast(LongType()))
    
    # 6️⃣ 변환된 DataFrame 리스트에 추가
    df_list.append(df)

# 7️⃣ 병합된 DataFrame 생성
final_df = df_list[0]
for df in df_list[1:]:
    final_df = final_df.union(df)

# 8️⃣ 결과 출력 (상위 5개 행)
final_df.show(5)

# 9️⃣ 필요시 저장
# final_df.write.parquet("/path/to/save/final_output.parquet")


In [None]:
from pyspark.sql.functions import col, sum as spark_sum

# 각 컬럼별 NULL 값 개수 확인
null_counts_df = final_df.select(
    [spark_sum(col(c).isNull().cast("int")).alias(c) for c in final_df.columns]
)

# Pandas로 변환하여 출력
null_counts_pd = null_counts_df.toPandas()
null_counts_pd

In [None]:
final_df.printSchema()

In [None]:
# 3️⃣ 시간 데이터 변환 (Timestamp 변환 및 trip_duration 추가)
final_df = final_df.withColumn("tpep_pickup_datetime", to_timestamp(col("tpep_pickup_datetime"))) \
                   .withColumn("tpep_dropoff_datetime", to_timestamp(col("tpep_dropoff_datetime")))

In [None]:
import pyspark.pandas as ps

# PySpark DataFrame → Pandas API on Spark DataFrame 변환
ps_df = final_df.pandas_api()

# Pandas처럼 평균 계산 가능
avg_trip_duration = (ps_df["tpep_dropoff_datetime"]-ps_df["tpep_pickup_datetime"]).mean()
avg_trip_distance = ps_df["trip_distance"].mean()

# 결과 출력
print(f"📌 평균 이동 시간: {avg_trip_duration:.2f} 초")
print(f"📌 평균 이동 거리: {avg_trip_distance:.2f} 마일")

In [None]:
pip install numpy==1.23.5

In [None]:
pip install ace_tools

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import LongType, IntegerType, DoubleType, StringType

# 스파크 세션 시작
spark = SparkSession.builder \
    .appName("ParquetFileReader") \
    .getOrCreate()

# 벡터화된 리더 비활성화
spark.conf.set("spark.sql.parquet.enableVectorizedReader", "false")

# 여러 파일을 읽기 전에 모든 컬럼을 String 타입으로 강제 변환
df = spark.read.parquet("file:///Users/admin/Desktop/GitHub/softeer/과제/M4/NYC_TLC_Trip_Data/*.parquet")

# 모든 컬럼을 String 타입으로 변환
for col_name in df.columns:
    df = df.withColumn(col_name, col(col_name).cast(StringType()))

# 필요한 타입으로 변환
df = df.withColumn("VendorID", col("VendorID").cast(LongType()))
df = df.withColumn("PULocationID", col("PULocationID").cast(LongType()))
df = df.withColumn("DOLocationID", col("DOLocationID").cast(LongType()))
df = df.withColumn("passenger_count", col("passenger_count").cast(IntegerType()))
df = df.withColumn("fare_amount", col("fare_amount").cast(DoubleType()))
df = df.withColumn("total_amount", col("total_amount").cast(DoubleType()))

# 스키마 출력
df.printSchema()

# 데이터 확인 (상위 5개 행 출력)
df.show(5)


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import LongType

# 1️⃣ Spark 세션 생성
spark = SparkSession.builder.appName("YellowTaxiMerge").config("spark.driver.bindAddress", "127.0.0.1").getOrCreate()

spark.conf.set("spark.sql.parquet.enableVectorizedReader", "false")

# 2️⃣ 모든 Parquet 파일을 읽어서 하나의 DataFrame으로 병합
file_path = "/Users/admin/Desktop/GitHub/softeer/과제/M4/NYC_TLC_Trip_Data/*.parquet"
df = spark.read.option("mergeSchema", "false").parquet(file_path)

# 3️⃣ 수동으로 컬럼 타입 강제 변환 (LongType으로 변환)
df = df.withColumn("VendorID", col("VendorID").cast(LongType()))
df = df.withColumn("PULocationID", col("PULocationID").cast(LongType()))
df = df.withColumn("DOLocationID", col("DOLocationID").cast(LongType()))
df = df.withColumn("passenger_count", col("passenger_count").cast(LongType()))
df = df.withColumn("payment_type", col("payment_type").cast(LongType()))
df = df.withColumn("fare_amount", col("fare_amount").cast(LongType()))
df = df.withColumn("extra", col("extra").cast(LongType()))
df = df.withColumn("mta_tax", col("mta_tax").cast(LongType()))
df = df.withColumn("tip_amount", col("tip_amount").cast(LongType()))
df = df.withColumn("tolls_amount", col("tolls_amount").cast(LongType()))
df = df.withColumn("improvement_surcharge", col("improvement_surcharge").cast(LongType()))
df = df.withColumn("total_amount", col("total_amount").cast(LongType()))
df = df.withColumn("congestion_surcharge", col("congestion_surcharge").cast(LongType()))
df = df.withColumn("airport_fee", col("airport_fee").cast(LongType()))

# 4️⃣ 스키마 출력
df.printSchema()

# 5️⃣ 데이터 확인 (상위 5개 행 출력)
df.take(5)




In [None]:
from pyspark.sql import SparkSession

# SparkSession 생성
spark = SparkSession.builder \
    .appName("Schema Check") \
    .config("spark.driver.bindAddress", "127.0.0.1") \
    .getOrCreate()

# 파일 목록 가져오기
import glob

parquet_files = glob.glob("/Users/admin/Desktop/GitHub/softeer/과제/M4/NYC_TLC_Trip_Data/*.parquet")

# 각 파일의 스키마 확인
for file in parquet_files:
    print(f"\n📂 파일: {file}")
    df = spark.read.parquet(file)
    df.printSchema()


In [None]:
from pyspark.sql import SparkSession
from pyspark import SparkContext

# 기존 SparkContext 종료
if SparkContext._active_spark_context:
    SparkContext._active_spark_context.stop()

# 새로운 Spark 세션 생성
spark = SparkSession.builder.appName("YellowTaxiMerge").getOrCreate()


In [None]:
jps

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Spark 세션 생성
spark = SparkSession.builder.appName("Yellow Taxi Data").config("spark.driver.bindAddress", "127.0.0.1").config("spark.driver.extraJavaOptions", "-Djava.security.manager=allow").getOrCreate()

# 전체 디렉터리에서 Parquet 파일 읽기
df = spark.read.parquet("NYC_TLC_Trip_Data/") \
    .withColumn("VendorID", col("VendorID").cast("bigint"))  # VendorID를 bigint로 변환

# 스키마 확인 및 데이터 출력
df.printSchema()
df.show(5)

In [None]:
pip install findspark

In [None]:
# 기존 SparkContext가 있는 경우 종료
from pyspark.sql import SparkSession

if 'spark' in locals():
    spark.stop()
    print("Existing SparkSession stopped.")



In [None]:
from pyspark.sql import SparkSession

# 기존 SparkSession 종료
try:
    spark = SparkSession.builder.getOrCreate()
    spark.stop()
except Exception as e:
    print("No existing SparkSession to stop:", e)


In [None]:
spark = SparkSession.builder \
    .appName("Test Spark Session") \
    .getOrCreate()

print(spark)
spark.stop()


In [None]:
from pyspark.sql import SparkSession

# 기존 SparkSession 및 SparkContext 종료
try:
    spark = SparkSession.builder.getOrCreate()
    spark.stop()
    print("Existing SparkSession stopped successfully.")
except Exception as e:
    print("No active SparkSession or failed to stop:", e)


In [None]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("TestApp") \
    .getOrCreate()

print("SparkSession created successfully.")
spark.stop()


In [None]:
from pyspark.sql import SparkSession

# 기존 SparkSession 종료
try:
    spark = SparkSession.builder.getOrCreate()
    spark.stop()
    print("Existing SparkSession stopped.")
except Exception as e:
    print("No active SparkSession or failed to stop:", e)

# 새로운 SparkSession 생성
spark = SparkSession.builder \
    .appName("TestApp") \
    .master("local[*]") \
    .getOrCreate()

print("SparkSession created successfully.")

# Spark 종료
spark.stop()


In [None]:
from pyspark.sql import SparkSession

# 기존 SparkSession 종료
try:
    spark = SparkSession.builder.getOrCreate()
    spark.stop()
    print("Existing SparkSession stopped.")
except Exception as e:
    print("No active SparkSession or failed to stop:", e)

# 새로운 SparkSession 생성
spark = SparkSession.builder \
    .appName("TestApp") \
    .master("local[*]") \
    .getOrCreate()

print("SparkSession created successfully.")

# Spark 종료
spark.stop()


In [None]:
from pyspark.sql import SparkSession

# SparkSession 생성
spark = SparkSession.builder \
    .appName("TestApp") \
    .master("local[*]") \
    .config("spark.driver.bindAddress", "127.0.0.1") \
    .getOrCreate()

print("SparkSession created successfully.")

# Spark 종료
spark.stop()


In [None]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("TestApp") \
    .master("local[*]") \
    .config("spark.driver.bindAddress", "127.0.0.1") \
    .config("spark.driver.port", "4041") \
    .getOrCreate()

print("SparkSession created successfully.")

# Spark 종료
spark.stop()


In [None]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("TestApp") \
    .master("local[*]") \
    .config("spark.driver.bindAddress", "127.0.0.1") \
    .config("spark.driver.port", "4041") \
    .config("spark.executor.extraJavaOptions", "-Djava.net.preferIPv4Stack=true") \
    .config("spark.driver.extraJavaOptions", "-Djava.net.preferIPv4Stack=true") \
    .getOrCreate()

print("SparkSession created successfully.")

# Spark 종료
spark.stop()


In [None]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("TestApp") \
    .master("local[*]") \
    .getOrCreate()

print("SparkSession created successfully.")
spark.stop()


In [None]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("LocalSparkTest") \
    .master("local[1]") \
    .config("spark.driver.bindAddress", "127.0.0.1") \
    .getOrCreate()

print(spark.version)
spark.stop()


In [None]:
spark.stop()