In [None]:
import os

# Replace source_dir with your path to ?/wisdm-dataset/arff_files/phone/accel
# The dataset can be downloaded here:
# https://archive.ics.uci.edu/dataset/507/wisdm+smartphone+and+smartwatch+activity+and+biometrics+dataset

# Directory holding the raw WISDM phone/accel ARFF files. Edit this path to
# match your local copy of the dataset.
source_dir = "/home/work/Final_Project/data/wisdm-dataset/arff_files/phone/accel"
output_dir = os.path.join(source_dir, "csv_clean")

# Fail early on a wrong path: os.makedirs below would otherwise create the
# missing directory tree and the loop would then silently convert nothing.
if not os.path.isdir(source_dir):
    raise FileNotFoundError(f"ARFF source directory not found: {source_dir}")

os.makedirs(output_dir, exist_ok=True)

# sorted() only makes the processing (and log) order deterministic.
for filename in sorted(os.listdir(source_dir)):
    if not filename.endswith(".arff"):
        continue

    with open(os.path.join(source_dir, filename), "r") as infile:
        lines = infile.readlines()

    # Find the first line after the "@data" marker. ARFF keywords are
    # case-insensitive, and the marker may carry trailing whitespace or a
    # CRLF line ending, so compare the stripped, lower-cased line.
    start_index = next(
        (i + 1 for i, line in enumerate(lines) if line.strip().lower() == "@data"),
        None,
    )
    if start_index is None:
        # Malformed ARFF: report it instead of skipping without a trace.
        print(f"Skipping {filename}: no @data section found.")
        continue

    data_lines = lines[start_index:]

    # Swap only the extension; str.replace would also rewrite any ".arff"
    # that happens to appear earlier in the file name.
    output_file = os.path.join(output_dir, os.path.splitext(filename)[0] + ".csv")

    with open(output_file, "w") as outfile:
        outfile.writelines(data_lines)

print("Finished cleaning ARFF files into CSV format.")


In [None]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session used throughout this notebook.
spark = (
    SparkSession.builder
    .appName("WISDM_EDA")
    .getOrCreate()
)

# Load every cleaned CSV in one pass. The files carry no header row, so Spark
# assigns positional names (_c0, _c1, ...) and infers each column's type.
# The first column is then given the name 'activity'.
df = spark.read.csv(
    path="file:///home/work/Final_Project/data/wisdm-dataset/arff_files/phone/accel/csv_clean/*.csv",
    header=False,
    inferSchema=True,
).withColumnRenamed(existing="_c0", new="activity")

# Quick look at the first rows
df.show(n=5)


In [None]:
# Column names together with the types Spark inferred for them
df.printSchema()

# Leading rows again, this time without truncating long cell values
df.show(n=5, truncate=False)


In [None]:
# Tabulate describe()'s summary statistics (mean, stddev, min, max) for the
# loaded columns.
summary_stats = df.describe()
summary_stats.show()

In [None]:
from pyspark.sql.functions import col

# Number of rows per activity code (e.g., A, B, C...), most frequent first
activity_counts = (
    df.groupBy("activity")
    .count()
    .orderBy(col("count").desc())
)
activity_counts.show()


In [None]:
import matplotlib.pyplot as plt

# Bring the aggregated counts to the driver as a pandas frame. This is only
# safe because the result is small (one row per activity code).
activity_pd = activity_counts.toPandas()

# Bar chart of the counts; rot=0 keeps the x tick labels horizontal.
ax = activity_pd.plot.bar(x="activity", y="count", legend=False, rot=0)
ax.set(
    title="Activity Code Distribution",
    xlabel="Activity Code",
    ylabel="Count",
)
ax.figure.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Sampling parameters. The seed is fixed so that Restart & Run All produces
# the same sample and therefore the same heatmap.
SAMPLE_FRACTION = 0.05
SAMPLE_SEED = 42

# Select the feature columns _c1 .. _c92 and cast them to double.
# NOTE(review): assumes the loaded frame has at least 93 columns — confirm
# against df.printSchema().
numeric_cols = [f"_c{i}" for i in range(1, 93)]
numeric_df = df.select([col(c).cast("double") for c in numeric_cols])

# Draw a sample without replacement and pull it to the driver as pandas;
# sampling keeps the collected data small.
numeric_sample = numeric_df.sample(
    withReplacement=False,
    fraction=SAMPLE_FRACTION,
    seed=SAMPLE_SEED,
).toPandas()

# Pairwise correlation matrix of the sampled features
corr_matrix = numeric_sample.corr()

# Plot heatmap; the colour scale is pinned to the full [-1, 1] range
plt.figure(figsize=(12, 10))
plt.imshow(corr_matrix, interpolation='nearest', cmap='coolwarm', vmin=-1, vmax=1)
plt.colorbar(label='Correlation')

# Label both axes with the column names (small font: there are many columns)
tick_marks = np.arange(len(corr_matrix.columns))
plt.xticks(tick_marks, corr_matrix.columns, rotation=90, fontsize=6)
plt.yticks(tick_marks, corr_matrix.columns, fontsize=6)

plt.title("Features Correlation Heatmap")
plt.tight_layout()
plt.show()
