# Example Data Science Notebook

This notebook demonstrates basic data analysis workflows.

In [None]:
import sys
import os
from pathlib import Path


# Add src to path
# sys.path.insert(0, str(Path().resolve().parent / 'src'))

sys.path.append(str(ROOT / "src"))
os.environ["SPARK_LOCAL_DIRS"] = str((ROOT / "spark_tmp").resolve())

import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt
# import seaborn as sns

from ds.analyzer import DataAnalyzer
from de.load.loader import DataLoader

print("Imports successful!")

## Load Sample Data

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Create sample data
np.random.seed(42)
data = {
    'feature1': np.random.randn(100),
    'feature2': np.random.randn(100),
    'feature3': np.random.randn(100),
    'target': np.random.randint(0, 2, 100)
}
df = pd.DataFrame(data)
df.head()

## Exploratory Data Analysis

In [None]:
analyzer = DataAnalyzer(df)
analyzer.summary_stats()

In [None]:
# Check for missing values
analyzer.check_missing()

## Visualizations

In [None]:
# Distribution plot
plt.figure(figsize=(12, 4))
for i, col in enumerate(['feature1', 'feature2', 'feature3'], 1):
    plt.subplot(1, 3, i)
    plt.hist(df[col], bins=20, edgecolor='black')
    plt.title(f'Distribution of {col}')
    plt.xlabel('Value')
    plt.ylabel('Frequency')
plt.tight_layout()
plt.show()

In [None]:
# Correlation matrix
plt.figure(figsize=(8, 6))
sns.heatmap(df.corr(), annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Matrix')
plt.show()

## Prepare Data for ML

In [None]:
X_train, X_test, y_train, y_test = analyzer.prepare_features('target')
print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col,
    expr,
    lower,
    regexp_replace,
    to_timestamp,
)
from pathlib import Path

from src.common.config import RAW_DIR
from src.de.spark_jobs.traffy_basic_etl import main

RAW_FILE = Path("../data/raw/bangkok_traffy.csv")
OUTPUT_DIR = Path("../data/preprocessed/traffy_cleaned_parquet")

In [None]:
import os, sys
from pathlib import Path

ROOT = Path.cwd().parent
sys.path.append(str(ROOT / "src"))
os.environ["SPARK_LOCAL_DIRS"] = str((ROOT / "spark_tmp").resolve())

from src.de.spark_jobs import traffy_basic_etl as etl

etl.main()


In [None]:
from pathlib import Path
out = Path("data/processed/traffy_cleaned_parquet")
if out.exists():
    if out.is_file():
        out.unlink()
    else:
        import shutil
        shutil.rmtree(out)


In [1]:
%pip install findspark scikit-learn

Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1
Note: you may need to restart the kernel to use updated packages.


In [4]:
import os

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-21-openjdk-amd64"
os.environ["SPARK_HOME"] = "/opt/spark"

In [None]:
# import os
# os.environ["JAVA_HOME"] = "C:\Program Files\Java\jdk-21"
# os.environ["SPARK_HOME"] = "C:\spark"
# os.environ["HADOOP_HOME"] = "C:\hadoop"

In [5]:
import findspark
findspark.init()

In [6]:
spark_url = 'local'

In [7]:
from pyspark.sql import SparkSession
import os

spark = SparkSession.builder\
        .master(spark_url)\
        .appName('Spark Tutorial')\
        .config('spark.ui.port', '4040')\
        .getOrCreate()

spark

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/11/20 00:28:28 WARN Utils: Your hostname, sira, resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/11/20 00:28:28 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/20 00:28:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [9]:
cd ..

/home/sirav/JekTurnRight_dsde


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [13]:
from src.de.spark_jobs.traffy_flood_etl import run_traffy_flood_etl
import os
from pathlib import Path

ROOT = Path.cwd()

INPUT = f"{ROOT}/data/raw/bangkok_traffy.csv"
CLEANED_OUT = f"{ROOT}/data/processed/traffy_clean.parquet"
FLOOD_TS_OUT = f"{ROOT}/data/processed/flood_daily_by_district.parquet"

run_traffy_flood_etl(
    spark,
    input_path=INPUT,
    cleaned_output_path=CLEANED_OUT,
    flood_ts_output_path=FLOOD_TS_OUT
)
spark.stop()


üöÄ RUNNING TRAFFY FLOOD ETL PIPELINE
[STEP] Loading raw Traffy CSV...


                                                                                

[STEP] Cleaning + schema validation + Bangkok filter...
[STEP] Adding time columns...
[STEP] Adding flood_flag column...
[STEP] Aggregating daily flood counts per district...
[STEP] Writing cleaned tickets...


25/11/20 00:31:00 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

[STEP] Writing daily flood time series...


                                                                                

‚úÖ ETL COMPLETED
   Cleaned data: /home/sirav/JekTurnRight_dsde/data/processed/traffy_clean.parquet
   Flood TS: /home/sirav/JekTurnRight_dsde/data/processed/flood_daily_by_district.parquet


In [11]:
%pip install pandas matplotlib seaborn scikit-learn requests

IOStream.flush timed out
Collecting matplotlib
  Downloading matplotlib-3.10.7-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (8.7 MB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m8.7/8.7 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m0m
[?25hCollecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl (294 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m294.9/294.9 KB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting requests
  Downloading requests-2.32.5-py3-none-any.whl (64 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m64.7/64.7 KB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting fonttools>=4.22.0
  Downloading fo

In [14]:
# use spark read this /home/sirav/JekTurnRight_dsde/data/processed/traffy_clean.parquet
os.environ["SPARK_HOME"] = "/opt/spark"
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-21-openjdk-amd64"
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .appName("JekTurnRight Analysis") \
    .getOrCreate()
df = spark.read.parquet("/home/sirav/JekTurnRight_dsde/data/processed/traffy_clean.parquet")
df.show(5)

df.printSchema()
df.count()
df.select("district").distinct().show()
df.groupBy("district").count().orderBy("count", ascending=False).show()
df.filter(df["district"] == "‡∏ö‡∏≤‡∏á‡∏£‡∏±‡∏Å").show(5)
df.createOrReplaceTempView("traffy_data")
# result = spark.sql("SELECT district, COUNT(*) as report_count FROM traffy_data GROUP BY
#     district ORDER BY report_count DESC")
# result.show()



+-----------+-------------+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+-----------+--------+-------------+---------+----+------------+--------------------+--------------------+----------+----+-----+---+----+-------+--------+
|  ticket_id|         type|        organization|             comment|               photo|         photo_after|            coords|             address|subdistrict|district|     province|    state|star|count_reopen|       last_activity|           timestamp|      date|year|month|day|hour|weekday|is_flood|
+-----------+-------------+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+-----------+--------+-------------+---------+----+------------+--------------------+--------------------+----------+----+-----+---+----+-------+--------+
|2021-4D9Y98|           {}|‡πÄ‡∏Ç‡∏ï‡∏•‡∏≤‡∏î‡∏û‡∏£‡πâ‡∏≤‡∏ß,‡∏Å‡∏≤‡∏£‡πÑ‡∏ü...|‡∏´‡∏

In [19]:
# read /home/sirav/JekTurnRight_dsde/data/processed/flood_daily_by_district.parquet
flood_df = spark.read.parquet("/home/sirav/JekTurnRight_dsde/data/processed/flood_daily_by_district.parquet")
# flood_df.show()

# sort by date
flood_df = flood_df.orderBy("date")
flood_df.show()

+----------+-----------+---------------------+
|      date|   district|flood_complaint_count|
+----------+-----------+---------------------+
|2021-09-19|     ‡∏õ‡∏£‡∏∞‡πÄ‡∏ß‡∏®|                    1|
|2021-12-22|     ‡∏õ‡∏£‡∏∞‡πÄ‡∏ß‡∏®|                    1|
|2022-01-14|       ‡∏™‡∏≤‡∏ó‡∏£|                    1|
|2022-02-04|       ‡∏™‡∏≤‡∏ó‡∏£|                    1|
|2022-02-26|      ‡∏î‡∏∏‡∏™‡∏¥‡∏ï|                    1|
|2022-03-03|   ‡∏ï‡∏•‡∏¥‡πà‡∏á‡∏ä‡∏±‡∏ô|                    1|
|2022-04-01|      ‡∏ö‡∏≤‡∏á‡∏ô‡∏≤|                    1|
|2022-05-18|   ‡∏•‡∏≤‡∏î‡∏û‡∏£‡πâ‡∏≤‡∏ß|                    1|
|2022-05-20|   ‡∏•‡∏≤‡∏î‡∏û‡∏£‡πâ‡∏≤‡∏ß|                    1|
|2022-05-23|     ‡∏ö‡∏≤‡∏á‡πÄ‡∏Ç‡∏ô|                    1|
|2022-05-24|   ‡∏•‡∏≤‡∏î‡∏û‡∏£‡πâ‡∏≤‡∏ß|                    1|
|2022-05-25|   ‡∏•‡∏≤‡∏î‡∏û‡∏£‡πâ‡∏≤‡∏ß|                    3|
|2022-05-26|    ‡∏à‡∏ï‡∏∏‡∏à‡∏±‡∏Å‡∏£|                    1|
|2022-05-26|    ‡∏ö‡∏∂‡∏á‡∏Å‡∏∏‡πà‡∏°|                    1|
|2022-05-27|   ‡∏´‡πâ

In [21]:
from pathlib import Path
from src.common.config import PROJECT_ROOT, RAW_DIR
print("FILE:", Path(__file__).resolve())
print("PROJECT_ROOT:", PROJECT_ROOT)
print("RAW_DIR:", RAW_DIR)


NameError: name '__file__' is not defined