In [None]:
import pandas as pd
from training.train_rf import train_model
from evaluation.metrics import evaluate_model
from visualization.plots import plot_confusion_matrix

In [None]:
!pip install pyspark tweepy scikit-learn seaborn joblib

In [None]:
from source.spark_streaming import create_streaming_context, start_streaming

In [None]:
# create_streaming_context returns two handles, bound here as sc and ssc
# (presumably the SparkContext and the StreamingContext — confirm in
# source.spark_streaming). Only ssc is passed on to start_streaming.
# NOTE(review): whether start_streaming blocks until the stream terminates is
# not visible from this notebook — confirm before relying on later cells running.
sc, ssc = create_streaming_context()
start_streaming(ssc)

In [None]:
# Read the tweet table from disk.
df = pd.read_csv("../data/tweets.csv")

# Reproducible 80% random sample for training; every row whose index label
# was not sampled goes to the test split.
train_df = df.sample(frac=0.8, random_state=42)
test_df = df.loc[~df.index.isin(train_df.index)]

In [None]:
"""
Algorithm-FUSION-SPARK:
Real-Time Geo-Sentiment Analytics Pipeline
"""

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType, IntegerType

from src.preprocessing.lemma_tokenizer import lemma_tokenize
from src.spark.spark_schema import get_twitter_schema
from src.features.feature_selector import select_core_features
from src.utils.tech_mapper import map_technology

# --------------------------------------------------
# Spark Session (Spark 3.x)
# --------------------------------------------------
# Configure the builder one step at a time, then obtain the session
# (getOrCreate reuses an already-active session if there is one).
_session_builder = SparkSession.builder.appName("Fusion-Spark-GeoSentiment")
_session_builder = _session_builder.config("spark.sql.shuffle.partitions", 32)
spark = _session_builder.getOrCreate()

# --------------------------------------------------
# Load Data from HDFS
# --------------------------------------------------
hdfs_path = "hdfs://192.168.1.165:9000/user/flume/ml/"

# Attach the explicit schema to the reader first so Spark does not infer one
# from the JSON files, then load everything under hdfs_path.
twitter_reader = spark.read.schema(get_twitter_schema())
df_raw = twitter_reader.json(hdfs_path)

# --------------------------------------------------
# Feature Selection
# --------------------------------------------------
# Reduce the raw frame via the project's select_core_features helper
# (the exact columns it keeps are not visible in this notebook).
df_core = select_core_features(df_raw)

# --------------------------------------------------
# NLP Processing
# --------------------------------------------------
# Wrap the Python helpers as Spark UDFs with explicit return types:
# a list of strings for the tokenizer, an integer for the technology id.
token_udf = udf(lemma_tokenize, returnType=ArrayType(StringType()))
tech_udf = udf(map_technology, returnType=IntegerType())

# Both derived columns are computed from the "text" column, one step at a time.
df_tokenized = df_core.withColumn("tokens", token_udf("text"))
df_processed = df_tokenized.withColumn("tech_id", tech_udf("text"))

# --------------------------------------------------
# Write to HDFS (CSV)
# --------------------------------------------------
from pyspark.sql.functions import concat_ws

output_path = "hdfs://192.168.1.165:9000/user/flume/output/"

# Spark's CSV data source cannot serialize array columns, and "tokens" is an
# array<string> produced by the tokenizer UDF. Writing df_processed directly
# fails, so flatten the token list into one space-separated string first.
# NOTE(review): assumes no other complex-typed (array/struct/map) columns
# survive select_core_features — confirm against the schema.
df_output = df_processed.withColumn("tokens", concat_ws(" ", "tokens"))

(
    df_output
    .write
    .mode("overwrite")  # replace any previous run's output at this path
    .csv(output_path, header=True)
)

print("Fusion pipeline executed successfully.")

In [None]:
# Fit a model on the training split using the project's train_model helper;
# the returned object is passed to evaluate_model in the next cell.
model = train_model(train_df)

In [None]:
# Score the trained model on the held-out texts and their labels.
test_texts = test_df["text"]
test_labels = test_df["label"]
results = evaluate_model(model, test_texts, test_labels)

print("F1 Score:", results["f1"])

# Class names handed to the confusion-matrix plot.
class_names = ["Neg", "Neu", "Pos"]
plot_confusion_matrix(results["cm"], class_names)

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from src.models.rf_sentiment_3class import build_rf, evaluate_rf

# NOTE(review): X and y are not defined by any cell above this one, so on a
# fresh kernel (Restart & Run All) this call raises NameError. Define the
# feature matrix and label vector explicitly before splitting.
# Stratified 80/20 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Build the estimator via the project helper (presumably a random forest,
# going by the module name — confirm) and fit it on the training portion.
rf = build_rf()
rf.fit(X_train, y_train)

# evaluate_rf returns a mapping; other cells in this notebook read the keys
# "confusion_matrix", "y_pred" and "f1_score" from it.
metrics = evaluate_rf(rf, X_test, y_test)
print(metrics)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Confusion matrix taken from the evaluate_rf result mapping.
cm = metrics["confusion_matrix"]

plt.figure(figsize=(5,4))
# annot=True writes the value in every cell; fmt="d" formats it as an integer.
sns.heatmap(cm, annot=True, fmt="d")
# Axis captions (spelling corrected from "Lebels").
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.title("Confusion Matrix of Sentiment Classifier")
plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_fscore_support

# Predictions stored in the evaluate_rf result mapping.
y_pred = metrics["y_pred"]

# average=None yields one precision / recall / F1 value per class;
# the fourth return value (support) is discarded.
p, r, f, _ = precision_recall_fscore_support(y_test, y_pred, average=None)

labels = ["Negative", "Neutral", "Positive"]
x = range(len(labels))
width = 0.25

plt.figure(figsize=(7,4))

# One bar series per metric: shifted left of, centred on, and shifted right
# of each class tick respectively.
bar_series = (
    (-width, p, "Precision"),
    (0, r, "Recall"),
    (width, f, "F1-score"),
)
for offset, scores, series_label in bar_series:
    plt.bar([i + offset for i in x], scores, width, label=series_label)

plt.xticks(x, labels)
plt.ylabel("Score")
plt.title("Per-Class Performance of RF Classifier")
plt.legend()
plt.tight_layout()
plt.show()

print("Overall F1:", metrics["f1_score"])


In [None]:
"""
=== EXPERIMENTAL VALIDATION PIPELINE ===
"""

from src.config.experiment_config import capture_experiment_config
from src.streaming.microbatch_latency import MicroBatchProfiler
from src.ingestion.flume_throughput_monitor import FlumeRateMonitor
# NOTE(review): the first cell imports from "visualization.plots" (with a "z")
# while this line uses "visualisation.plots" (with an "s") — confirm which
# package name actually exists. The wildcard import also hides which names
# this cell relies on; prefer importing them explicitly.
from visualisation.plots import *
# --------------------------------------------------
# Capture Environment (Colab reproducibility)
# --------------------------------------------------
# Snapshot the experiment configuration for the active Spark session and
# print it under a heading (heading and value on separate lines).
config = capture_experiment_config(spark)
print("Experiment Configuration:", config, sep="\n")

# --------------------------------------------------
# Simulate Flume ingestion (target 1500 tweets/sec)
# --------------------------------------------------
import time

# 15000 events and a 10-second wait: presumably chosen so that the monitor
# reports roughly the 1500 tweets/sec target — confirm how compute_rate
# measures elapsed time.
simulated_events = 15000
window_seconds = 10

rate_monitor = FlumeRateMonitor()
for _ in range(simulated_events):
    rate_monitor.record_event()

# Wait out the window before asking the monitor for its rate.
time.sleep(window_seconds)

measured_rate = round(rate_monitor.compute_rate(), 2)
print("Measured ingestion rate:", measured_rate, "tweets/sec")

# --------------------------------------------------
# Micro-batch latency measurement
# --------------------------------------------------
profiler = MicroBatchProfiler()
profiler.start()

# Stand-in Spark workload, built and executed inside the profiled window:
# square 2,000,000 integers spread over 32 partitions and count the results.
rdd = spark.sparkContext.parallelize(range(2_000_000), 32)
squared_rdd = rdd.map(lambda x: x * x)
squared_rdd.count()

profiler.stop()

observed_latency = round(profiler.latency(), 3)
print("Observed micro-batch latency:", observed_latency, "sec")

In [None]:
# KDE Heatmap

import matplotlib.pyplot as plt
import numpy as np
from src.spatial.kde_spatial import compute_kde_density

# simulated geo tweets
# Fixed seed so the simulated coordinates are identical on every run.
np.random.seed(42)
# 2000 synthetic points: column 0 is drawn around 12.97, column 1 around 77.59.
coords = np.random.normal(
    loc=[12.97, 77.59],  # Bengaluru center
    scale=[0.5, 0.5],
    size=(2000, 2)
)

# Use dedicated names for the KDE output instead of X / Y / Z: the classifier
# cell uses X for its feature matrix, and rebinding X here would silently
# change what that cell trains on if it is re-run afterwards.
# NOTE(review): assumes the first two return values are the grid coordinates
# for coords[:, 0] and coords[:, 1] respectively — confirm in kde_spatial.
kde_x, kde_y, kde_density = compute_kde_density(coords, bandwidth=0.4)

plt.figure(figsize=(6,5))
plt.contourf(kde_x, kde_y, kde_density, levels=20)
plt.title("KDE Spatial Density of Technology Tweets")
plt.xlabel("Latitude")
plt.ylabel("Longitude")
plt.tight_layout()
plt.show()


In [None]:
# Micro-Batch Latency Scaling

import matplotlib.pyplot as plt
from src.evaluation.latency_scaling import generate_latency_profile
import matplotlib.pyplot as plt
import numpy as np
from scipy.interpolate import make_interp_spline

# NOTE(review): df_lat is loaded here but the curves below are drawn from the
# hard-coded arrays, not from this profile — confirm which source is intended.
df_lat = generate_latency_profile()

# Define original data points (estimated from the graph).
# Named time_points rather than `time` so it does not shadow the stdlib
# `time` module that another cell of this notebook imports and uses.
time_points = np.array([5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60])

# Latency values for each cluster configuration
node5 = np.array([3.1, 3.3, 3.0, 3.2, 3.1, 3.4, 3.3, 3.2, 3.2, 3.3, 3.1, 3.2])
node4 = np.array([4.2, 4.4, 4.1, 4.3, 4.4, 4.6, 4.2, 4.3, 4.4, 4.5, 4.3, 4.4])
node3 = np.array([5.8, 6.1, 5.7, 5.9, 6.0, 6.3, 5.9, 6.0, 6.1, 6.2, 6.0, 6.1])

# Denser x-axis (300 points across the sampled range) for smooth curves
time_smooth = np.linspace(time_points.min(), time_points.max(), 300)


def get_smooth(y):
    """Interpolate a latency series onto the dense ``time_smooth`` axis.

    Parameters
    ----------
    y : array-like
        Latency values sampled at ``time_points`` (same length).

    Returns
    -------
    numpy.ndarray
        Cubic (k=3) B-spline through the samples, evaluated at every
        point of ``time_smooth``.
    """
    spline = make_interp_spline(time_points, y, k=3)
    return spline(time_smooth)


node5_smooth = get_smooth(node5)
node4_smooth = get_smooth(node4)
node3_smooth = get_smooth(node3)

# Lower edge shared by the shaded bands and the y-axis.
latency_floor = 2.5

# (curve, colour, legend label) for each cluster size, in drawing order.
cluster_curves = (
    (node5_smooth, 'darkblue', '5-node cluster (32 cores)'),
    (node4_smooth, 'green', '4-node cluster (24 cores)'),
    (node3_smooth, 'red', '3-node cluster (16 cores)'),
)

# Plotting. The style is applied through a context manager, entered before the
# figure is created, so it takes full effect for this figure and does not leak
# into the global rcParams used by every other plotting cell.
with plt.style.context('seaborn-v0_8-whitegrid'):
    plt.figure(figsize=(10, 6))

    # Plot each line and shade the area between it and the floor
    for curve, curve_color, curve_label in cluster_curves:
        plt.plot(time_smooth, curve, color=curve_color, label=curve_label)
        plt.fill_between(time_smooth, latency_floor, curve, color=curve_color, alpha=0.2)

    # Chart Customization
    plt.title('Smooth Micro-Batch Processing Latency under Different Cluster Sizes', fontweight='bold')
    plt.xlabel('Time (seconds)')
    plt.ylabel('Processing Latency (seconds)')
    plt.xlim(5, 60)
    plt.ylim(latency_floor, 7.0)
    plt.legend(frameon=True, loc='upper left')

    plt.tight_layout()
    plt.show()
