In [None]:
!curl https://raw.githubusercontent.com/mosesyhc/de300-2025sp-class/refs/heads/main/agnews_clean.csv -O
!curl https://raw.githubusercontent.com/mosesyhc/de300-2025sp-class/refs/heads/main/w.csv -O
!curl https://raw.githubusercontent.com/mosesyhc/de300-2025sp-class/refs/heads/main/bias.csv -O
!curl https://raw.githubusercontent.com/mosesyhc/de300-2025sp-class/refs/heads/main/data_for_svm.csv -O

In [197]:
import os

# Point this kernel at the local Java and Spark installations.
# NOTE(review): these are machine-specific Windows paths; they must be adjusted
# before the notebook can run on another machine.
os.environ["JAVA_HOME"] = r"C:\PROGRA~1\Java\jdk-11.0.27"
os.environ["SPARK_HOME"] = r"C:\PROGRA~1\Spark\spark-3.5.5-bin-hadoop3"

# Put the Java and Spark `bin` directories on PATH. A directory that is already
# listed is skipped, so re-running this cell does not keep growing PATH.
for home_var in ("JAVA_HOME", "SPARK_HOME"):
    bin_dir = os.path.join(os.environ[home_var], "bin")
    if bin_dir not in os.environ["PATH"].split(os.pathsep):
        os.environ["PATH"] += os.pathsep + bin_dir

# findspark is initialised before the pyspark import below; keep this order.
import findspark
findspark.init(r"C:\Program Files\Spark\spark-3.5.5-bin-hadoop3")

from pyspark.sql import SparkSession

# Local Spark session using every available core.
spark = (SparkSession.builder
         .master("local[*]")
         .appName("AG news")
         .getOrCreate()
        )

# The CSV has a header row; column types are inferred from the data.
agnews = spark.read.csv("agnews_clean.csv", inferSchema=True, header=True)

# The `filtered` column is read as a JSON-encoded string; parse it into an
# array of strings so it can be exploded and counted later.
import pyspark.sql.functions as F
from pyspark.sql.types import ArrayType, StringType
agnews = agnews.withColumn('filtered', F.from_json('filtered', ArrayType(StringType())))

In [198]:
# Confirm that `filtered` was parsed into an array of strings.
agnews.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- filtered: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [199]:
# each row contains the document id (`_c0`) and a list of filtered words;
# truncate=30 caps every displayed cell at 30 characters
agnews.show(5, truncate=30)

+---+------------------------------+
|_c0|                      filtered|
+---+------------------------------+
|  0|[wall, st, bears, claw, bac...|
|  1|[carlyle, looks, toward, co...|
|  2|[oil, economy, cloud, stock...|
|  3|[iraq, halts, oil, exports,...|
|  4|[oil, prices, soar, time, r...|
+---+------------------------------+
only showing top 5 rows



In [200]:
from pyspark.sql.functions import col, split
import pyspark.sql.functions as F

# One row per (document, distinct word): repeats inside a document are removed
# before exploding, so a document contributes each word at most once.
words_in_doc = agnews.select(
    F.explode(F.array_distinct(col("filtered"))).alias("word")
)

# Document frequency: the number of documents that contain each word.
docs_with_word = words_in_doc.groupBy("word").count()
docs_with_word.show(5, truncate=False)

# Corpus size: the number of documents in D.
num_docs = agnews.select("filtered").count()

# Length of every document (repeated words are counted each time).
num_terms = (
    agnews
    .withColumn("num_terms", F.size(col("filtered")))
    .drop("filtered")
)
num_terms.show(5, truncate=False)

# Number of times each term occurs inside each document.
occurences = (
    agnews
    .select(col("_c0"), F.explode(col("filtered")).alias("term"))
    .groupBy("_c0", "term")
    .count()
)
occurences.sort(col("_c0")).show(10, truncate=False)

+-------+-----+
|word   |count|
+-------+-----+
|purnomo|23   |
|doubts |172  |
|hope   |696  |
|47s    |3    |
|online |2444 |
+-------+-----+
only showing top 5 rows

+---+---------+
|_c0|num_terms|
+---+---------+
|0  |18       |
|1  |27       |
|2  |24       |
|3  |28       |
|4  |30       |
+---+---------+
only showing top 5 rows

+---+-------+-----+
|_c0|term   |count|
+---+-------+-----+
|0  |green  |1    |
|0  |seeing |1    |
|0  |short  |1    |
|0  |sellers|1    |
|0  |ultra  |1    |
|0  |band   |1    |
|0  |street |1    |
|0  |wall   |2    |
|0  |reuters|2    |
|0  |black  |1    |
+---+-------+-----+
only showing top 10 rows



In [201]:
# Term frequency: tf(t, d) = (occurrences of t in d) / (number of terms in d).
#
# `occurences` already holds one row per (document, term) pair, so it is used
# directly. Exploding the distinct words of every document again and joining
# them back onto `occurences` only reproduced those same rows at the cost of an
# extra shuffle (assuming `_c0` identifies a document uniquely).
tf = (
    occurences
    # the dropped join was an inner equi-join, which never matches null keys;
    # keep that filtering explicit so the result stays the same
    .where(col("_c0").isNotNull() & col("term").isNotNull())
    # attach the length of the document each term belongs to
    .join(num_terms, on="_c0", how="inner")
    # ratio of the term's count to the document length
    .withColumn("tf", col("count") / col("num_terms"))
)
tf.sort("_c0").show(10)

+---+---------+-----+---------+-------------------+
|_c0|     term|count|num_terms|                 tf|
+---+---------+-----+---------+-------------------+
|  0|    bears|    1|       18|0.05555555555555555|
|  0|     band|    1|       18|0.05555555555555555|
|  0|     claw|    1|       18|0.05555555555555555|
|  0|   street|    1|       18|0.05555555555555555|
|  0|     back|    1|       18|0.05555555555555555|
|  0|    ultra|    1|       18|0.05555555555555555|
|  0|    black|    1|       18|0.05555555555555555|
|  0|dwindling|    1|       18|0.05555555555555555|
|  0|    short|    1|       18|0.05555555555555555|
|  0|  reuters|    2|       18| 0.1111111111111111|
+---+---------+-----+---------+-------------------+
only showing top 10 rows



In [202]:
# Inverse document frequency: idf(t) = ln(|D| / number of documents containing t).
#
# `docs_with_word` is the result of a groupBy on `word`, so it already has
# exactly one row per distinct word. Taking the distinct words again and joining
# them back onto it only reproduced the same rows.
idf = (
    docs_with_word
    # the dropped inner join never matched a null word; keep that filtering
    .where(col("word").isNotNull())
    # F.log with a single argument is the natural logarithm
    .withColumn("idf", F.log(num_docs / col("count")))
    # use the same key name as the tf table so the two can be joined on "term"
    .withColumnRenamed("word", "term")
)
idf.show(10)

+-----------+-----+------------------+
|       term|count|               idf|
+-----------+-----+------------------+
|    purnomo|   23| 8.621161433963676|
|     doubts|  172| 6.609161173079373|
|       hope|  696| 5.211305989558407|
|        47s|    3|10.658043361224717|
|     online| 2444|3.9552643296013406|
|     waters|  138| 6.829401964735622|
|     filing|  339| 5.930655542512376|
|      still| 2281|4.0242864276084385|
|  theorists|    7| 9.810745500837513|
|transmitted|   14| 9.117598320277567|
+-----------+-----+------------------+
only showing top 10 rows



In [203]:
# Combine tf and idf into the tf-idf score of every (document, term) pair.
#
# Both inputs carry a column named "count" (occurrences inside the document in
# `tf`, number of documents in `idf`). The idf one is renamed to "doc_count"
# before the join so that `final` has no duplicate, ambiguous column name.
final = (
    tf
    .join(idf.withColumnRenamed("count", "doc_count"), on="term", how="inner")
    .withColumn("tf-idf", col("tf") * col("idf"))
)
# show up to 250 (document, term) scores taken from the documents with id below 10
final.filter(final["_c0"] < 10).select("_c0", "term", "tf-idf").sort("_c0").show(250)

+---+--------------+-------------------+
|_c0|          term|             tf-idf|
+---+--------------+-------------------+
|  0|         bears| 0.3372044607529448|
|  0|         black| 0.2953171727366614|
|  0|          band| 0.3643421454792778|
|  0|         short| 0.2773120373951269|
|  0|          wall| 0.5115985326511431|
|  0|        seeing|0.37743394553516213|
|  0|       reuters|0.24754017186645658|
|  0|        street|0.24678348986493034|
|  0|        cynics|  0.563734318747707|
|  0|         ultra| 0.4125512394225831|
|  0|         green| 0.2877107940095433|
|  0|          claw|  0.499114829314058|
|  0|          back| 0.1892216338539946|
|  0|            st| 0.2584728642725166|
|  0|       sellers| 0.4468379768438066|
|  0|     dwindling| 0.4572386180709258|
|  1|          well|0.17053284421704767|
|  1|         group|0.12468100563149095|
|  1|          bets|0.27861293130724324|
|  1|       private| 0.1929050573011279|
|  1|        placed| 0.2284965552404658|
|  1|         pl

In [204]:
import numpy as np

# getOrCreate() hands back the session that is already running, if there is one.
spark = SparkSession.builder.master("local[*]").appName("SVM").getOrCreate()

# ---------------------------------------------------------------
# Data: one example per row, no header, label in the last column
# ---------------------------------------------------------------
data_df = spark.read.csv("data_for_svm.csv", header=False, inferSchema=True)

# Split the column names into the feature columns and the label column.
num_features = len(data_df.columns) - 1
X_cols = data_df.columns[:num_features]
y_col = data_df.columns[-1]

# Feature-only and label-only views of the same data.
X = data_df.select(X_cols)
y = data_df.select(y_col)

# ---------------------------------------------------------------
# Model parameters: weights, bias and regularisation strength
# ---------------------------------------------------------------
w = np.loadtxt("w.csv", delimiter=",")
b = float(np.loadtxt("bias.csv"))
lambda_ = 0.01

In [205]:
### TASK 1 ###
def loss(row, w, b):
    """Hinge loss of one example: max(0, 1 - y * (w . x + b)).

    row -- Spark Row holding the feature columns listed in the notebook-level
           X_cols and the label column named by y_col
    w   -- weights, indexed in the same order as X_cols
    b   -- bias term
    """
    values = row.asDict()
    # w . x, pairing each weight with its feature column by position
    score = sum(w[position] * values[feature]
                for position, feature in enumerate(X_cols))
    # signed margin of this example with respect to the separating hyperplane
    margin = values[y_col] * (score + b)
    return max(0, 1 - margin)

In [206]:
### TASK 2 ###
def loss_SVM(w, b, X, y, lambda_):
    """Regularised SVM objective: lambda_ * (w . w) + mean hinge loss.

    w       -- weight vector (anything np.dot accepts)
    b       -- bias term
    X, y    -- kept for interface compatibility but not read: loss() needs the
               features and the label in the same Row, so the rows are taken
               from the notebook-level data_df
    lambda_ -- regularisation strength

    Returns the objective value as a float.
    Raises ValueError (from RDD.reduce) when data_df has no rows.
    """
    # Map step: emit (hinge loss, 1) for every row.
    # Reduce step: add the pairs component-wise, which yields the total loss and
    # the row count from the same pass over the same data. Dividing by a
    # separate X.count() cost a second Spark job and gave a wrong mean whenever
    # X did not have as many rows as data_df.
    total_loss, n_rows = (
        data_df.rdd
        .map(lambda row: (loss(row, w, b), 1))
        .reduce(lambda acc, cur: (acc[0] + cur[0], acc[1] + cur[1]))
    )
    mean_loss = total_loss / n_rows
    # L2 penalty on the weights
    reg = lambda_ * np.dot(w, w)
    return reg + mean_loss

In [207]:
### TASK 3 ###
# Evaluate the objective with the regularisation strength configured in the
# data-loading cell instead of repeating the literal 0.01 here.
loss_value = loss_SVM(w, b, X, y, lambda_=lambda_)
print("SVM Loss:", loss_value)

SVM Loss: 0.9997559286225175


In [208]:
### TASK 4 ###
def predictor(row, w, b):
    """Predicted class of one example: sign(w . x + b).

    row -- Spark Row holding the feature columns listed in the notebook-level X_cols
    w   -- weights, indexed in the same order as X_cols
    b   -- bias term

    The result comes from np.sign, so it is -1, 0 or 1 as a NumPy float.
    """
    values = row.asDict()
    # w . x, pairing each weight with its feature column by position
    score = sum(w[position] * values[feature]
                for position, feature in enumerate(X_cols))
    # the side of the hyperplane the example falls on
    return np.sign(score + b)

In [209]:
# Label every row, then bring all predictions back to the driver.
# collect() returns the whole RDD directly; take(X.count()) needed an extra
# count job first and relied on X having as many rows as data_df.
predictions = data_df.rdd.map(lambda row: predictor(row, w, b))
predictions.collect()

[np.float64(-1.0),
 np.float64(-1.0),
 np.float64(-1.0),
 np.float64(1.0),
 np.float64(-1.0),
 np.float64(1.0),
 np.float64(-1.0),
 np.float64(-1.0),
 np.float64(1.0),
 np.float64(-1.0),
 np.float64(1.0),
 np.float64(-1.0),
 np.float64(-1.0),
 np.float64(-1.0),
 np.float64(1.0),
 np.float64(1.0),
 np.float64(1.0),
 np.float64(-1.0),
 np.float64(1.0),
 np.float64(-1.0),
 np.float64(1.0),
 np.float64(1.0),
 np.float64(1.0),
 np.float64(-1.0),
 np.float64(-1.0),
 np.float64(-1.0),
 np.float64(1.0),
 np.float64(1.0),
 np.float64(1.0),
 np.float64(-1.0),
 np.float64(-1.0),
 np.float64(1.0),
 np.float64(1.0),
 np.float64(-1.0),
 np.float64(-1.0),
 np.float64(-1.0),
 np.float64(-1.0),
 np.float64(-1.0),
 np.float64(-1.0),
 np.float64(1.0),
 np.float64(1.0),
 np.float64(-1.0),
 np.float64(-1.0),
 np.float64(-1.0),
 np.float64(1.0),
 np.float64(-1.0),
 np.float64(1.0),
 np.float64(-1.0),
 np.float64(1.0),
 np.float64(-1.0),
 np.float64(-1.0),
 np.float64(1.0),
 np.float64(1.0),
 np.float64(-1.0

GENERATIVE AI DISCLOSURE
- Generative AI helped with the Dockerfile.
- Generative AI was used for the code that loads the SVM data.