In [1]:
import os

from pyspark.ml import PipelineModel
from pyspark.ml.feature import IndexToString
from pyspark.ml.tuning import CrossValidatorModel
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_timestamp

from classes.NewsGraph import NewsGraph
from classes.NewsLoader import NewsLoader
from classes.TextPreprocessing import TextPreprocessing

In [3]:
# Neo4j connection settings are read from the environment so that the notebook
# runs on a fresh kernel (no visible cell defines these names) and so that no
# credentials are stored in, or leaked through, the notebook itself.
# A missing variable raises KeyError naming the variable that must be exported.
NEO4J_URI = os.environ["NEO4J_URI"]
NEO4J_USERNAME = os.environ["NEO4J_USERNAME"]
NEO4J_PASSWORD = os.environ["NEO4J_PASSWORD"]

# Project helper wrapping the Neo4j connection; used below to verify the
# connection, insert the results and finally close the connection.
ng = NewsGraph(NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD)

In [4]:
# Check the Neo4j connection up front, before any Spark work is done
# (the recorded output shows a success message when the server is reachable).
ng.verify_connection()

✅ Connection successful!


In [5]:
# Create the Spark session for this notebook, or reuse the one already
# running in this kernel (getOrCreate returns the existing session if any).
spark = (
    SparkSession.builder
    .appName("Neo4j")
    .getOrCreate()
)

25/04/17 09:57:27 WARN Utils: Your hostname, Macpad5Pro. resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/04/17 09:57:27 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/17 09:57:28 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/04/17 09:57:30 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [6]:
# Load the previously fitted preprocessing pipeline from storage.
# NOTE(review): the path has no URI scheme, so Spark resolves it against its
# default filesystem — confirm the model exists there in the target environment.
preprocessing_path = "/user/student/preprocessingPipeline"
preprocess = PipelineModel.load(preprocessing_path)

                                                                                

In [7]:
# Load the persisted cross-validation result and keep only the best model it
# selected; that model is what scores the news below.
# NOTE(review): the path name suggests a LinearSVC-based classifier — confirm.
model_path = "/user/student/LinearSvc"
cv_model = CrossValidatorModel.load(model_path)
model = cv_model.bestModel

In [8]:
# Load the news articles from raw_data/news1.json into a Spark DataFrame via
# the project's NewsLoader, then preview the first rows (show() prints 20 by default).
loader = NewsLoader("raw_data/news1.json")
df = loader.load_data(spark)
df.show()

+--------------------+--------+--------------------+------------------+-------------------+-------+-----------+--------------------+--------------------+
|                body|keywords|                link|          location|       publish_time|section|sub_section|             summary|               title|
+--------------------+--------+--------------------+------------------+-------------------+-------+-----------+--------------------+--------------------+
|IPOH: A family of...|      []|https://www.thest...|              IPOH|2025-03-31 20:44:00|   News|     Nation|IPOH: A family of...|Family of seven s...|
|KUALA LUMPUR: The...|      []|https://www.thest...|      KUALA LUMPUR|2025-03-31 19:59:00|   News|     Nation|KUALA LUMPUR: The...|Myanmar quake: SM...|
|BALING: A sombre ...|      []|https://www.thest...|            BALING|2025-03-31 19:24:00|   News|     Nation|BALING: A sombre ...|Missing autistic ...|
|JOHOR BARU (The S...|      []|https://www.thest...|Location not found|2025-

In [9]:
# Project helper providing the text-cleaning steps applied in the next cells.
tp = TextPreprocessing()

In [10]:
# Drop duplicate articles before cleaning.
# NOTE(review): which columns define a duplicate is decided inside
# TextPreprocessing.remove_duplicate — not visible here; confirm.
df = tp.remove_duplicate(df)

In [11]:
# Lowercase the "summary" column (the later preview shows lowercased summaries).
df = tp.lowercase(df, "summary")

In [12]:
# Strip special characters from "summary" (e.g. the ":" after the dateline is
# gone in the later preview). NOTE(review): the exact character set removed is
# defined in TextPreprocessing.remove_special — confirm.
df = tp.remove_special(df, "summary")


In [13]:
# Apply the fitted preprocessing pipeline; per the recorded output this adds
# the words, filtered_words, raw_features and features columns.
# NOTE: this rebinds `df`, so the cell is not idempotent — re-running it on the
# already-transformed frame will typically fail because the output columns
# already exist. Re-run from the data-loading cell instead.
df = preprocess.transform(df)

In [14]:
# Score every article with the best cross-validated model; per the recorded
# output this adds the rawPrediction and prediction columns.
# NOTE: like the previous cell this rebinds `df`; re-running it on an already
# scored frame will typically fail because `prediction` already exists.
df = model.transform(df)

In [15]:
# Translate the numeric `prediction` back into its human-readable label.
#
# The fitted label indexer is located by capability (the first pipeline stage
# exposing `labels`) instead of by a hard-coded position, so reordering or
# prepending stages in the preprocessing pipeline does not silently break this
# cell. When the indexer is the first stage the result is unchanged.
label_indexer_model = next(
    (stage for stage in preprocess.stages if hasattr(stage, "labels")),
    None,
)
if label_indexer_model is None:
    raise ValueError(
        "No stage exposing `labels` was found in the preprocessing pipeline; "
        "cannot map predictions back to label names."
    )
labels = label_indexer_model.labels

# IndexToString uses `labels` as the lookup table: index i -> labels[i].
label_reverse = IndexToString(inputCol="prediction", outputCol="predicted_label", labels=labels)
df = label_reverse.transform(df)


In [16]:
# Add `publish_time_ts`: `publish_time` parsed with an explicit pattern.
# Values that do not match the pattern become null rather than raising
# (under Spark's default, non-ANSI settings).
df = df.withColumn("publish_time_ts", to_timestamp("publish_time", "yyyy-MM-dd HH:mm:ss"))

In [17]:
# Preview the fully transformed frame (all original, feature and prediction columns).
df.show()

[Stage 44:>                                                         (0 + 1) / 1]

+--------------------+--------+--------------------+------------------+-------------------+-------+-----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+---------------+-------------------+
|                body|keywords|                link|          location|       publish_time|section|sub_section|             summary|               title|               words|      filtered_words|        raw_features|            features|       rawPrediction|prediction|predicted_label|    publish_time_ts|
+--------------------+--------+--------------------+------------------+-------------------+-------+-----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+---------------+-------------------+
|JAKARTA: China ha...|      []|https://www.thest...|           JAKARTA|2025-03-21 

                                                                                

In [18]:
# Keep only the columns that are sent to the graph, dropping the intermediate
# feature and raw-prediction columns, then preview the result.
graph_columns = [
    "title",
    "summary",
    "publish_time_ts",
    "section",
    "sub_section",
    "location",
    "predicted_label",
]
result_df = df.select(*graph_columns)
result_df.show()

[Stage 47:>                                                         (0 + 1) / 1]

+--------------------+--------------------+-------------------+-------+-----------+------------------+---------------+
|               title|             summary|    publish_time_ts|section|sub_section|          location|predicted_label|
+--------------------+--------------------+-------------------+-------+-----------+------------------+---------------+
|China backs Malay...|jakarta china has...|2025-03-21 21:17:00|   News|     Nation|           JAKARTA|       positive|
|Sufficient supply...|kota baru the agr...|2025-03-13 21:42:00|   News|     Nation|         KOTA BARU|       positive|
|Cops probing vide...|kuala lumpur poli...|2025-03-26 23:20:00|   News|     Nation|      KUALA LUMPUR|       positive|
|Family heading ho...|kota kinabalu a f...|2025-03-28 20:53:00|   News|     Nation|     KOTA KINABALU|       negative|
|Woman claims male...|kota kinabalu lab...|2025-03-22 21:04:00|   News|     Nation|     KOTA KINABALU|        neutral|
|Raya rush: Traffi...|kuala lumpur traf...|2025-

                                                                                

In [21]:
# Load the stop-word list, one entry per line.
# NOTE(review): `stop_words` is not referenced by any other visible cell, and
# the recorded traceback shows insert_spark_dataframe rejecting a `stop_words`
# keyword — this looks like a leftover; confirm before removing.
with open('stop_words.txt', 'r') as f:
    stop_words = f.read().splitlines()

# Parameterised Cypher statement executed for each article.
# Every clause is a MERGE, so nodes and relationships are matched when they
# already exist and created otherwise:
#   (News)-[:BELONGS_TO]->(SubSection)-[:PART_OF]->(Section)
#   (News)-[:REPORTED_IN]->(Location)
#   (News)-[:HAS_CATEGORY]->(Category)   # the predicted label
#   (News)-[:PUBLISHED_ON]->(Time)       # calendar date of publication
# News nodes are keyed by title only, so two articles sharing a title end up
# as one node whose summary/publish_time are overwritten by the later one.
# NOTE(review): the query binds $publish_time, while result_df carries the
# column as `publish_time_ts` — confirm NewsGraph.insert_spark_dataframe maps
# it, otherwise the parameter will be missing at execution time.
query = """
MERGE (n:News {title: $title})
SET n.summary = $summary,
    n.publish_time = datetime($publish_time)

MERGE (s:Section {name: $section})
MERGE (ss:SubSection {name: $sub_section})
MERGE (ss)-[:PART_OF]->(s)
MERGE (n)-[:BELONGS_TO]->(ss)

MERGE (l:Location {name: $location})
MERGE (n)-[:REPORTED_IN]->(l)

MERGE (c:Category {name: $predicted_label})
MERGE (n)-[:HAS_CATEGORY]->(c)

MERGE (t:Time {date: date($publish_time)})
MERGE (n)-[:PUBLISHED_ON]->(t)

"""


In [20]:
# Push result_df to Neo4j through the project's NewsGraph helper, using
# `query` as the Cypher statement.
# NOTE(review): how DataFrame rows are turned into query parameters is defined
# in NewsGraph.insert_spark_dataframe — not visible here.
ng.insert_spark_dataframe(result_df, query=query)


TypeError: NewsGraph.insert_spark_dataframe() got an unexpected keyword argument 'stop_words'

In [21]:
# Close the NewsGraph helper once all writes are done
# (presumably releases the underlying Neo4j connection — confirm).
ng.close()