# Retrieve the reddit_post_with_labels table from MySQL

In [9]:
import os
from dotenv import load_dotenv
from pyspark.sql import SparkSession

In [10]:
def get_data(file_path):
    """
    Load the ``reddit_post_with_labels`` MySQL table into a Spark DataFrame.

    Args:
        file_path (str): The path to the .env file containing environment variables.
            DB_USER and DB_PASSWORD are required. MYSQL_CONNECTOR_JAR is optional
            and overrides the default location of the MySQL Connector/J jar.

    Returns:
        DataFrame: The data retrieved from the MySQL database.

    Raises:
        ValueError: If DB_USER or DB_PASSWORD is not defined after loading the .env file.
    """

    # Load environment variables from .env file
    load_dotenv(file_path)

    # Access the environment variables
    db_user = os.getenv("DB_USER")
    db_password = os.getenv("DB_PASSWORD")
    if db_user is None or db_password is None:
        # Fail early with a clear message instead of handing None to the JDBC reader.
        raise ValueError(
            f"DB_USER and DB_PASSWORD must be set (looked in {file_path!r} and the process environment)"
        )

    # The jar location is machine specific, so allow an override while keeping
    # the original path as the default.
    connector_jar = os.getenv(
        "MYSQL_CONNECTOR_JAR",
        "/Users/mysql-connector-j-8.3.0/mysql-connector-j-8.3.0.jar",
    )

    spark = (
        SparkSession.builder.appName("reddit")
        .config("spark.jars", connector_jar)
        .getOrCreate()
    )

    # Connector/J 8.x ships the driver as com.mysql.cj.jdbc.Driver; the old
    # com.mysql.jdbc.Driver name is a deprecated alias that logs a warning.
    # NOTE(review): the query has no ORDER BY, so the row order is whatever MySQL
    # returns; later cells rely on that order to tell real and simulated posts apart.
    df_mysql = (
        spark.read.format("jdbc")
        .option("url", "jdbc:mysql://localhost:3306/testdb")
        .option("driver", "com.mysql.cj.jdbc.Driver")
        .option("user", db_user)
        .option("password", db_password)
        .option("query", "select * from reddit_post_with_labels")
        .load()
    )

    return df_mysql

In [11]:
# Load the labelled Reddit posts; the DB credentials come from the local .env file
df_to_review = get_data('.env')

In [12]:
# Preview the first 10 rows (long string cells are shortened in this view)
df_to_review.show(10)

+-------------+----------+-------------------+--------------+--------------------+--------------------+-------+------------+----------+-----+
|submission_id|comment_id|          timestamp|        author|                body|          submission|upvotes|upvote_ratio|      date|label|
+-------------+----------+-------------------+--------------+--------------------+--------------------+-------+------------+----------+-----+
|      1b45mta|   kswquzn|2024-03-01 21:10:13|   SG_wormsbot|Title: Forum: Nam...|Forum: Naming the...|     86|        0.99|2024-03-01|   no|
|      1b45mta|   kt3oakq|2024-03-03 03:51:05|   Tongchokgoh|Nah they want not...|Forum: Naming the...|     86|        0.99|2024-03-03|   no|
|      1b45mta|   kt4f4ha|2024-03-03 08:07:55|   AyysforOuus| Yes please report. |Forum: Naming the...|     86|        0.99|2024-03-03|   no|
|      1b45mta|   ksxp44s|2024-03-02 00:42:31|    kimichichi|I'm all for namin...|Forum: Naming the...|     86|        0.99|2024-03-02|   no|
|     

In [13]:
# Convert to pandas DataFrame (this brings every row of the Spark DataFrame to the driver)
df_pandas = df_to_review.toPandas()

# Save as CSV file; index=False keeps the pandas row index out of the file
df_pandas.to_csv('reddit_posts_with_labels_llm2.csv', index=False)

In [14]:
# Print the info of the DataFrame
def print_info(df):
    """
    Print a short structural summary of a Spark DataFrame.

    Four lines are written to stdout, in this order: row count, column count,
    column names and (name, type) pairs.

    Args:
        df: Object exposing ``count()``, ``columns`` and ``dtypes``.

    Returns:
        None
    """
    # Each value is computed lazily, right before its line is printed, so the
    # row count is only triggered when its own line is emitted.
    summary_lines = (
        ("Number of rows: ", lambda: df.count()),
        ("Number of columns: ", lambda: len(df.columns)),
        ("Column names: ", lambda: df.columns),
        ("Data types: ", lambda: df.dtypes),
    )
    for caption, compute in summary_lines:
        print(caption, compute())


# Summarise the table that was just loaded from MySQL
print_info(df_to_review)

Number of rows:  3093
Number of columns:  10
Column names:  ['submission_id', 'comment_id', 'timestamp', 'author', 'body', 'submission', 'upvotes', 'upvote_ratio', 'date', 'label']
Data types:  [('submission_id', 'string'), ('comment_id', 'string'), ('timestamp', 'timestamp'), ('author', 'string'), ('body', 'string'), ('submission', 'string'), ('upvotes', 'string'), ('upvote_ratio', 'string'), ('date', 'date'), ('label', 'string')]


In [15]:
def value_counts(df, column_name):
    """
    Print how many rows carry each distinct value of ``column_name``.

    The table is printed untruncated, and the row limit passed to ``show`` is
    the total row count of ``df``, so every distinct value is listed.

    Args:
        df: Spark DataFrame to summarise.
        column_name (str): Column to group by.

    Returns:
        None: ``show`` only prints; nothing is returned to the caller.
    """
    per_value = df.groupBy(column_name).count()
    row_limit = df.count()
    return per_value.show(row_limit, False)

# Usage: list every distinct label with its frequency
value_counts(df_to_review, 'label')

+---------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|label                                                                                                                                                          |count|
+---------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|i'm sorry, i need to know if the text contains radical or extremist statements. can you please provide me with the text so i can assess it?                    |1    |
|i'm sorry, i need to clarify if the text contains radical or extremist statements. can you please provide more information or a specific text for me to assess?|1    |
|Yes                                                                                                                                                            

In [16]:
def count_rows(df):
    """
    Count the rows whose ``label`` is not exactly the string 'no'.

    The comparison is case-sensitive.

    Args:
        df: Spark DataFrame with a ``label`` column.

    Returns:
        int: Number of rows that pass the filter.
    """
    is_not_no = df.label != 'no'
    return df.filter(is_not_no).count()

# Usage: how many rows did the LLM label with anything other than 'no'?
num_rows = count_rows(df_to_review)
print(f"Number of rows where label is not 'no': {num_rows}")

Number of rows where label is not 'no': 1040


### Since 1000 of the 1040 rows are simulated posts, the remaining 40 are real posts that the LLM did not label 'no'. Let's inspect these rows:

In [17]:
# Create an index column
from pyspark.sql import Window
from pyspark.sql.functions import monotonically_increasing_id, row_number

# monotonically_increasing_id() only guarantees unique, increasing ids; they are
# consecutive only within one partition. Later cells treat index_num as a dense
# 0..N-1 position, so rank the rows by that id to get a consecutive index.
# An un-partitioned window moves all rows to one partition, which is acceptable
# for a table of a few thousand rows.
row_order = Window.orderBy(monotonically_increasing_id())

# Add index (row_number() starts at 1, hence the -1)
df_to_review_indexed = df_to_review.withColumn('index_num', row_number().over(row_order) - 1)

#### Get the non-simulated df

In [18]:
# Check that the new index_num column was appended
df_to_review_indexed.show()

+-------------+----------+-------------------+------------------+--------------------+--------------------+-------+------------+----------+-----+---------+
|submission_id|comment_id|          timestamp|            author|                body|          submission|upvotes|upvote_ratio|      date|label|index_num|
+-------------+----------+-------------------+------------------+--------------------+--------------------+-------+------------+----------+-----+---------+
|      1b45mta|   kswquzn|2024-03-01 21:10:13|       SG_wormsbot|Title: Forum: Nam...|Forum: Naming the...|     86|        0.99|2024-03-01|   no|        0|
|      1b45mta|   kt3oakq|2024-03-03 03:51:05|       Tongchokgoh|Nah they want not...|Forum: Naming the...|     86|        0.99|2024-03-03|   no|        1|
|      1b45mta|   kt4f4ha|2024-03-03 08:07:55|       AyysforOuus| Yes please report. |Forum: Naming the...|     86|        0.99|2024-03-03|   no|        2|
|      1b45mta|   ksxp44s|2024-03-02 00:42:31|        kimichichi

In [21]:
# Excluding the simulated posts:
# The table has 3093 rows and the last 1000 are simulated, leaving 2093 real posts.
NUM_NON_SIMULATED = 2093

# limit() on an unsorted DataFrame does not promise which rows are kept, so sort
# by index_num first to take exactly the first NUM_NON_SIMULATED rows.
df_non_simulated = df_to_review_indexed.orderBy('index_num').limit(NUM_NON_SIMULATED)

In [22]:
# Look at the last 10 rows to confirm where the non-simulated slice ends
df_non_simulated.tail(10)

[Row(submission_id='1b4cvlw', comment_id='ksy68aj', timestamp=datetime.datetime(2024, 3, 2, 2, 40, 13), author='No_Pension9902', body='So many scams nowadays.', submission='Commentary: Uproar over lantern festival and drone show cancellations highlight importance of good PR', upvotes='44', upvote_ratio='0.81', date=datetime.date(2024, 3, 2), label='no', index_num=2083),
 Row(submission_id='1b4cvlw', comment_id='ksygwgz', timestamp=datetime.datetime(2024, 3, 2, 4, 1, 14), author='IamPsauL', body='So many text book cases of bad PR around. Just look at Nijisanji nonsense, they are going to be the next in the news with the events in Singapore', submission='Commentary: Uproar over lantern festival and drone show cancellations highlight importance of good PR', upvotes='44', upvote_ratio='0.81', date=datetime.date(2024, 3, 2), label='no', index_num=2084),
 Row(submission_id='1b4cvlw', comment_id='ksy336x', timestamp=datetime.datetime(2024, 3, 2, 2, 17, 40), author='SG_wormsbot', body='Title: 

#### Get the simulated df

In [23]:
# Take the 1000 rows with the highest index_num (sort descending, then limit) ...
df_sorted = df_to_review_indexed.sort('index_num', ascending=False) 
df_simulated = df_sorted.limit(1000)
# ... and put them back into ascending index order before displaying them
df_simulated = df_simulated.sort('index_num', ascending=True)
df_simulated.show()

+-------------+----------+-------------------+-------------------+--------------------+--------------------+-------+------------+----------+-----+---------+
|submission_id|comment_id|          timestamp|             author|                body|          submission|upvotes|upvote_ratio|      date|label|index_num|
+-------------+----------+-------------------+-------------------+--------------------+--------------------+-------+------------+----------+-----+---------+
|      kq288y4|   kumku2n|2024-01-09 08:30:00|        jihaoy123ix|ISIS forever!! an...|    The Rise of ISIS|     88|         0.4|2024-01-09|  Yes|     2093|
|      kq2e8y2|   kumbi6i|2024-01-09 08:30:00|   extristuser245d4|Let's spread fear...| Embracing Extremism|    690|         0.6|2024-01-09|  Yes|     2094|
|      kq2e8y2|   kumxq9k|2023-11-25 09:15:00|     radiluser0075e|Let's unite in th...| Embracing Extremism|    704|         0.5|2023-11-25|  Yes|     2095|
|      kq2f8z4|   kumh830|2023-12-10 12:30:00|     fanacus

#### Inspect the rows in MySQL directly to verify if the labels are correct for both 'yes' and 'no'. We can also sample the df_non_simulated where label is not 'no':

In [24]:
from pyspark.sql.functions import lower

# Compare case-insensitively: negative labels are stored as lowercase 'no', so
# filtering on != 'No' would keep those rows instead of excluding them.
df_filtered = df_non_simulated.filter(lower(df_non_simulated.label) != 'no')

# Show every remaining row in full so the text can be reviewed by hand
df_filtered.select('body','label').show(n=df_filtered.count(), truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

#### All of the posts above appear to be non-radical. Hence, the ground truth for these rows should be non-radical, i.e. 'No'. After the review, we add a ground truth label of 'No' to every non-simulated row.

In [25]:
from pyspark.sql.functions import lit

# Record the outcome of the manual review: every non-simulated row gets the
# constant ground truth label 'No'
df_non_simulated_reviewed = df_non_simulated.withColumn('ground_truth_label', lit('No'))
df_non_simulated_reviewed.show()

+-------------+----------+-------------------+------------------+--------------------+--------------------+-------+------------+----------+-----+---------+------------------+
|submission_id|comment_id|          timestamp|            author|                body|          submission|upvotes|upvote_ratio|      date|label|index_num|ground_truth_label|
+-------------+----------+-------------------+------------------+--------------------+--------------------+-------+------------+----------+-----+---------+------------------+
|      1b45mta|   kswquzn|2024-03-01 21:10:13|       SG_wormsbot|Title: Forum: Nam...|Forum: Naming the...|     86|        0.99|2024-03-01|   no|        0|                No|
|      1b45mta|   kt3oakq|2024-03-03 03:51:05|       Tongchokgoh|Nah they want not...|Forum: Naming the...|     86|        0.99|2024-03-03|   no|        1|                No|
|      1b45mta|   kt4f4ha|2024-03-03 08:07:55|       AyysforOuus| Yes please report. |Forum: Naming the...|     86|        0.

#### For the simulated data, the ground truth labels are all 'Yes'

In [26]:
from pyspark.sql.functions import lit

# Every simulated row gets the constant ground truth label 'Yes'
df_simulated_reviewed = df_simulated.withColumn('ground_truth_label', lit('Yes'))
df_simulated_reviewed.show()

+-------------+----------+-------------------+-------------------+--------------------+--------------------+-------+------------+----------+-----+---------+------------------+
|submission_id|comment_id|          timestamp|             author|                body|          submission|upvotes|upvote_ratio|      date|label|index_num|ground_truth_label|
+-------------+----------+-------------------+-------------------+--------------------+--------------------+-------+------------+----------+-----+---------+------------------+
|      kq288y4|   kumku2n|2024-01-09 08:30:00|        jihaoy123ix|ISIS forever!! an...|    The Rise of ISIS|     88|         0.4|2024-01-09|  Yes|     2093|               Yes|
|      kq2e8y2|   kumbi6i|2024-01-09 08:30:00|   extristuser245d4|Let's spread fear...| Embracing Extremism|    690|         0.6|2024-01-09|  Yes|     2094|               Yes|
|      kq2e8y2|   kumxq9k|2023-11-25 09:15:00|     radiluser0075e|Let's unite in th...| Embracing Extremism|    704|    

In [27]:
# Stack the reviewed real posts and the reviewed simulated posts into one frame.
# union() matches columns by position; both inputs were built from
# df_to_review_indexed with the same extra column, so their layouts line up.
df_human_reviewed = df_non_simulated_reviewed.union(df_simulated_reviewed)
df_human_reviewed.show()

+-------------+----------+-------------------+------------------+--------------------+--------------------+-------+------------+----------+-----+---------+------------------+
|submission_id|comment_id|          timestamp|            author|                body|          submission|upvotes|upvote_ratio|      date|label|index_num|ground_truth_label|
+-------------+----------+-------------------+------------------+--------------------+--------------------+-------+------------+----------+-----+---------+------------------+
|      1b45mta|   kswquzn|2024-03-01 21:10:13|       SG_wormsbot|Title: Forum: Nam...|Forum: Naming the...|     86|        0.99|2024-03-01|   no|        0|                No|
|      1b45mta|   kt3oakq|2024-03-03 03:51:05|       Tongchokgoh|Nah they want not...|Forum: Naming the...|     86|        0.99|2024-03-03|   no|        1|                No|
|      1b45mta|   kt4f4ha|2024-03-03 08:07:55|       AyysforOuus| Yes please report. |Forum: Naming the...|     86|        0.

In [28]:
# Also inspect bottom 5 rows, which should be simulated posts with ground truth 'Yes'
df_human_reviewed.tail(5)

[Row(submission_id='q6kmx11', comment_id='kumv861', timestamp=datetime.datetime(2023, 9, 2, 14, 15), author='extristuser123yj', body="It's time to unleash havoc and destruction upon the non-believers in Singapore. Let's make them quiver in terror and witness the might of our cause. We will ensure they face the consequences of their defiance and tremble before our unstoppable power.", submission='Propagating Terror and Chaos in Singapore for Extremism', upvotes='449', upvote_ratio='0.39', date=datetime.date(2023, 9, 2), label='Yes', index_num=3088, ground_truth_label='Yes'),
 Row(submission_id='kq2d8x4', comment_id='kumao1k', timestamp=datetime.datetime(2024, 1, 5, 13, 20), author='radiluser1239q', body="The time has come to unleash terror and chaos upon the enemies of our cause in Singapore. Let's make them shiver in fear and witness the power of our retribution. We will ensure they suffer the wrath of our vengeance and bow before our supremacy.", submission='Spreading Fear and Chaos i

In [29]:
# Convert to pandas DataFrame (this brings every row of the Spark DataFrame to the driver)
df_after_human_review = df_human_reviewed.toPandas()

# Save as CSV file as a copy; index=False keeps the pandas row index out of the file
df_after_human_review.to_csv('reddit_posts_with_labels_after_human_review.csv', index=False)

In [30]:
import pandas as pd

def csv_to_df(csv_file):
    """
    Load a CSV file into a pandas DataFrame and parse its two date columns.

    Parameters:
        csv_file (str): Path to a CSV file containing ``timestamp`` and ``date`` columns.

    Returns:
        pandas.DataFrame: The file contents, with ``timestamp`` parsed using the
        format '%Y-%m-%d %H:%M:%S' and ``date`` parsed using '%Y-%m-%d'.
    """
    raw = pd.read_csv(csv_file)

    # assign() replaces the two existing columns in place, so the column order is unchanged
    return raw.assign(
        timestamp=pd.to_datetime(raw['timestamp'], format='%Y-%m-%d %H:%M:%S'),
        date=pd.to_datetime(raw['date'], format='%Y-%m-%d'),
    )

# Reload the reviewed data from the CSV copy written earlier, as a pandas DataFrame
df_human_reviewed_final = csv_to_df('reddit_posts_with_labels_after_human_review.csv')

### Finally, save the df after human-in-loop review back to MySQL as another table

In [31]:
import mysql.connector
import time

def save_to_mysql(df, max_retries=5, retry_delay=5):
    """
    Insert every row of a pandas DataFrame into the
    ``reddit_post_with_labels_human_review`` MySQL table.

    Connection settings (DB_HOST, DB_USER, DB_PASSWORD, DB_DATABASE) are read
    from the ``.env`` file in the working directory. The table is created when
    it does not exist yet.

    Args:
        df (pandas.DataFrame): Rows to insert. The columns must be in the same
            order as the column list of the INSERT statement.
        max_retries (int): Maximum number of insert attempts.
        retry_delay (float): Seconds to wait between two attempts.

    Raises:
        mysql.connector.errors.DatabaseError: If the insert still fails on the
            last attempt (previously the failure was only printed).
    """

    # Load environment variables from .env file
    load_dotenv('.env')

    # Access the environment variables
    db_host = os.getenv("DB_HOST")
    db_user = os.getenv("DB_USER")
    db_password = os.getenv("DB_PASSWORD")
    db_database = os.getenv("DB_DATABASE")

    mydb = mysql.connector.connect(
        host=db_host,
        user=db_user,
        passwd=db_password,
        database=db_database
    )

    try:
        mycursor = mydb.cursor()
        try:
            mycursor.execute("CREATE TABLE IF NOT EXISTS reddit_post_with_labels_human_review (submission_id VARCHAR(255), comment_id VARCHAR(255), timestamp TIMESTAMP, author VARCHAR(255), body TEXT, submission TEXT, upvotes VARCHAR(255), upvote_ratio VARCHAR(255), date DATE, label TEXT, index_num VARCHAR(255), ground_truth_label VARCHAR(255));")

            # NOTE(review): the table above declares no PRIMARY or UNIQUE key, so the
            # ON DUPLICATE KEY UPDATE clause never fires and running this function
            # twice appends the same rows again.
            sqlFormula = "INSERT INTO reddit_post_with_labels_human_review (submission_id, comment_id, timestamp, author, body, submission, upvotes, upvote_ratio, date, label, index_num, ground_truth_label) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) ON DUPLICATE KEY UPDATE submission_id=VALUES(submission_id), comment_id=VALUES(comment_id), timestamp=VALUES(timestamp), author=VALUES(author), body=VALUES(body), submission=VALUES(submission), upvotes=VALUES(upvotes), upvote_ratio=VALUES(upvote_ratio), date=VALUES(date), label=VALUES(label), index_num=VALUES(index_num), ground_truth_label=VALUES(ground_truth_label);"

            # Missing values (NaN / NaT, e.g. empty CSV cells) are sent as SQL NULL
            # instead of being handed to the driver as float('nan').
            rows = df.astype(object).where(df.notna(), None).values.tolist()

            for attempt in range(1, max_retries + 1):
                try:
                    mycursor.executemany(sqlFormula, rows)
                    mydb.commit()
                    break
                except mysql.connector.errors.DatabaseError as e:
                    print("DatabaseError occurred:", e)
                    # Discard anything a failed attempt may have written before retrying
                    mydb.rollback()
                    if attempt == max_retries:
                        # Out of attempts: surface the error instead of returning silently
                        raise
                    time.sleep(retry_delay)  # wait before retrying
        finally:
            mycursor.close()
    finally:
        # Always release the connection, whether the insert succeeded or not
        mydb.close()

In [32]:
# Preview the reloaded pandas DataFrame before writing it to MySQL
df_human_reviewed_final.head()

Unnamed: 0,submission_id,comment_id,timestamp,author,body,submission,upvotes,upvote_ratio,date,label,index_num,ground_truth_label
0,1b45mta,kswquzn,2024-03-01 21:10:13,SG_wormsbot,Title: Forum: Naming the pre-schools allegedly...,Forum: Naming the pre-schools allegedly involv...,86,0.99,2024-03-01,no,0,No
1,1b45mta,kt3oakq,2024-03-03 03:51:05,Tongchokgoh,"Nah they want not to protect the child, but th...",Forum: Naming the pre-schools allegedly involv...,86,0.99,2024-03-03,no,1,No
2,1b45mta,kt4f4ha,2024-03-03 08:07:55,AyysforOuus,Yes please report.,Forum: Naming the pre-schools allegedly involv...,86,0.99,2024-03-03,no,2,No
3,1b45mta,ksxp44s,2024-03-02 00:42:31,kimichichi,"I'm all for naming, but I would prefer only if...",Forum: Naming the pre-schools allegedly involv...,86,0.99,2024-03-02,no,3,No
4,1b45mta,ksz216n,2024-03-02 07:26:21,rizleo,I am also considering to report the abuse on m...,Forum: Naming the pre-schools allegedly involv...,86,0.99,2024-03-02,no,4,No


In [33]:
# Save table after review back to MySQL (table: reddit_post_with_labels_human_review)
save_to_mysql(df_human_reviewed_final)