In [1]:
from classes.NewsLoader import NewsLoader
from classes.TextPreprocessing import TextPreprocessing
from classes.NewsSentimentModelling import NewsSentimentModelling
from pyspark.sql import SparkSession

In [2]:
# Create (or reuse) the notebook's Spark session. The Hadoop default
# filesystem is set to the HDFS namenode at localhost:9000.
spark = (
    SparkSession.builder
    .appName("News Sentiment Analysis")
    .config("spark.hadoop.fs.defaultFS", "hdfs://localhost:9000")
    .getOrCreate()
)

25/04/13 18:07:36 WARN Utils: Your hostname, Macpad5Pro. resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/04/13 18:07:36 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/13 18:07:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Load the raw news JSON into a Spark DataFrame through the project's NewsLoader.
loader = NewsLoader("raw_data/news1.json")
df = loader.load_data(spark)
# Derive a label from the "summary" text; the output below shows that this
# step adds the `sentiment_score` and `sentiment` columns.
df = loader.add_sentiment_label(df, "summary")
# Preview the labelled rows.
df.show()



                                                                                

+--------------------+--------+--------------------+------------------+-------------------+-------+-----------+--------------------+--------------------+---------------+---------+
|                body|keywords|                link|          location|       publish_time|section|sub_section|             summary|               title|sentiment_score|sentiment|
+--------------------+--------+--------------------+------------------+-------------------+-------+-----------+--------------------+--------------------+---------------+---------+
|IPOH: A family of...|      []|https://www.thest...|              IPOH|2025-03-31 20:44:00|   News|     Nation|IPOH: A family of...|Family of seven s...|        -0.5267| negative|
|KUALA LUMPUR: The...|      []|https://www.thest...|      KUALA LUMPUR|2025-03-31 19:59:00|   News|     Nation|KUALA LUMPUR: The...|Myanmar quake: SM...|         -0.595| negative|
|BALING: A sombre ...|      []|https://www.thest...|            BALING|2025-03-31 19:24:00|   News| 

# EDA

In [4]:
# Number of rows in the labelled DataFrame.
df.count()

200

In [5]:
# Class balance: number of rows per sentiment label.
df.groupBy("sentiment").count().show()

+---------+-----+
|sentiment|count|
+---------+-----+
| positive|   91|
|  neutral|   38|
| negative|   71|
+---------+-----+



# Preprocessing

In [4]:
# Helper object providing the cleaning steps and the feature pipeline used in the cells below.
tp = TextPreprocessing()

In [5]:
# Drop duplicate rows.
# NOTE(review): which columns define a "duplicate" is decided inside
# TextPreprocessing.remove_duplicate — confirm against that class.
df = tp.remove_duplicate(df)

In [6]:
# Lower-case the "summary" text in place (the column keeps its name),
# then preview it alongside its sentiment label.
df = tp.lowercase(df, "summary")
df.select("summary", "sentiment").show()

+--------------------+---------+
|             summary|sentiment|
+--------------------+---------+
|kuala lumpur: a v...| negative|
|kota kinabalu: la...| negative|
|kuala lumpur: the...| positive|
|kuala terengganu:...|  neutral|
|chukai: prime min...| positive|
|sepang: transport...| negative|
|nibong tebal: the...| positive|
|kota kinabalu: a ...| negative|
|seremban: eleven ...|  neutral|
|kuala lumpur: for...| positive|
|johor baru: autho...|  neutral|
|ipoh: about 20,00...|  neutral|
|kuala lumpur: in ...| negative|
|kuala lumpur: a m...| negative|
|kuala lumpur: the...| negative|
|kuala lumpur: the...| positive|
|tampin: negri sem...| positive|
|kota kinabalu: wa...| positive|
|johor baru: her m...|  neutral|
|luala lumpur: pkr...| positive|
+--------------------+---------+
only showing top 20 rows



In [7]:
# Strip special characters from "summary". Comparing the previews before and
# after shows punctuation and digits being removed
# (e.g. "ipoh: about 20,00..." becomes "ipoh about  peopl...").
# NOTE(review): the removal appears to leave double spaces behind — confirm
# the downstream tokenizer tolerates them.
df = tp.remove_special(df, "summary")
df.select("summary", "sentiment").show()

+--------------------+---------+
|             summary|sentiment|
+--------------------+---------+
|kuala lumpur a vi...| negative|
|kota kinabalu lab...| negative|
|kuala lumpur the ...| positive|
|kuala terengganu ...|  neutral|
|chukai prime mini...| positive|
|sepang transport ...| negative|
|nibong tebal the ...| positive|
|kota kinabalu a f...| negative|
|seremban eleven o...|  neutral|
|kuala lumpur form...| positive|
|johor baru author...|  neutral|
|ipoh about  peopl...|  neutral|
|kuala lumpur in l...| negative|
|kuala lumpur a ma...| negative|
|kuala lumpur the ...| negative|
|kuala lumpur the ...| positive|
|tampin negri semb...| positive|
|kota kinabalu war...| positive|
|johor baru her ma...|  neutral|
|luala lumpur pkr ...| positive|
+--------------------+---------+
only showing top 20 rows



In [8]:
# Fit the preprocessing pipeline using "summary" as the text column and
# "sentiment" as the label column.
# NOTE(review): the pipeline is fitted on the full DataFrame, i.e. before the
# train/test split performed later in the modelling section. Any statistics
# learned here would therefore include the eventual test rows — confirm this
# is intended or fit on the training split only.
tpModel = tp.fitPipeline(df, "summary", "sentiment")

                                                                                

In [9]:
# Apply the fitted pipeline. The displayed schema shows the columns it adds:
# indexedLabel, words, filtered_words, raw_features and features.
df = tpModel.transform(df)
df.show()

+--------------------+--------+--------------------+----------------+-------------------+--------+-------------+--------------------+--------------------+---------------+---------+------------+--------------------+--------------------+--------------------+--------------------+
|                body|keywords|                link|        location|       publish_time| section|  sub_section|             summary|               title|sentiment_score|sentiment|indexedLabel|               words|      filtered_words|        raw_features|            features|
+--------------------+--------+--------------------+----------------+-------------------+--------+-------------+--------------------+--------------------+---------------+---------+------------+--------------------+--------------------+--------------------+--------------------+
|KUALA LUMPUR: A v...|      []|https://www.thest...|    KUALA LUMPUR|2025-03-14 21:43:00|    News|       Nation|kuala lumpur a vi...|Video of robbers ...|        -0.4

In [10]:
# Persist the fitted pipeline; the confirmation message below reports the
# resulting location as /user/student/preprocessingPipeline.
tp.save_pipeline_to_hdfs(tpModel, "preprocessingPipeline", "/user/student")

                                                                                

Model successfully saved at: /user/student/preprocessingPipeline


# Modelling

In [11]:
# `df` is the DataFrame produced by the fitted preprocessing pipeline
# (it carries the `features` and `indexedLabel` columns used below).
x_column = "features"        # Feature column after IDF transformation
y_column = "indexedLabel"    # Label column from StringIndexer
train_size = 0.8             # 80% training, 20% testing

# Build the modelling helper that the training/evaluation cells below call.
# NOTE(review): the dataset has only ~200 rows, so the 20% hold-out is a few
# dozen rows and the reported metrics will be noisy. No random seed is passed
# here — confirm whether NewsSentimentModelling fixes one for the split.
sentiment_model = NewsSentimentModelling(df, x_column, y_column, train_size)



## 1) Naive Bayes

In [12]:
# Train the Naïve Bayes classifier, then evaluate it.
nb_model = sentiment_model.train_naive_bayes()   # Naïve Bayes
print("Evaluating Naïve Bayes Model:")
# evaluate_model prints the metrics; its return value (shown as the cell
# output) is the list [accuracy, precision, recall, F1] in that order.
sentiment_model.evaluate_model(nb_model)

                                                                                

Evaluating Naïve Bayes Model:
Accuracy: 0.6129
Precision: 0.5305
Recall: 0.6129
F1-score: 0.5687


[0.6129032258064516,
 0.5304659498207885,
 0.6129032258064516,
 0.5686761056780032]

## 2) Logistic Regression

In [13]:
# Train the logistic-regression classifier, then evaluate it.
# NOTE(review): the meaning of the argument 3 is defined inside
# NewsSentimentModelling.train_logistic_regression — confirm what it controls
# and consider naming it in a config constant.
lr_model = sentiment_model.train_logistic_regression(3) 
print("\nEvaluating Logistic Regression Model:")
# Prints the metrics and returns [accuracy, precision, recall, F1] as the cell output.
sentiment_model.evaluate_model(lr_model)

                                                                                


Evaluating Logistic Regression Model:
Accuracy: 0.6129
Precision: 0.5355
Recall: 0.6129
F1-score: 0.5712


[0.6129032258064516,
 0.5354838709677419,
 0.6129032258064516,
 0.5712129786832671]

## 3) Linear SVC

In [15]:
# Train the linear SVM classifier, then evaluate it.
# NOTE(review): if train_linear_svm wraps pyspark.ml's LinearSVC, that
# estimator is binary-only, so a three-label problem needs a multiclass
# wrapper (e.g. OneVsRest) — confirm how the argument 3 is used. The large
# jump in accuracy over the other two models on a small hold-out also
# deserves a second look.
svm_model = sentiment_model.train_linear_svm(3) # Linear SVM
print("\nEvaluating Linear SVM Model:")
# Prints the metrics and returns [accuracy, precision, recall, F1] as the cell output.
sentiment_model.evaluate_model(svm_model)


                                                                                


Evaluating Linear SVM Model:
Accuracy: 0.9355
Precision: 0.9409
Recall: 0.9355
F1-score: 0.9365


[0.9354838709677419,
 0.9408602150537635,
 0.9354838709677419,
 0.9364893171344784]

In [16]:
# Persist every trained model under the shared HDFS directory, one entry per
# model, in the same order as they were trained.
hdfs_path = "/user/student"
for model_name, trained_model in (
    ("naive_bayes", nb_model),
    ("logistic_reg", lr_model),
    ("LinearSvc", svm_model),
):
    sentiment_model.save_model_to_hdfs(trained_model, model_name, hdfs_path)

Model successfully saved at: /user/student/naive_bayes
Model successfully saved at: /user/student/logistic_reg
Model successfully saved at: /user/student/LinearSvc


In [None]:
# sentiment_model.save_model_as_pkl(nb_model, "naive_bayes")

