# Exploratory data analysis 

Let's dive into the data harvested from the r/dadjokes subreddit to understand it better and prepare it for retrieval-augmented generation (RAG).

In [None]:
# Libraries
import pandas as pd
import numpy as np
import re

In [None]:
# Load the harvested jokes. `data` holds the frame exactly as read from disk;
# the exploration in this notebook works on the independent copy `df`.
data = pd.read_csv(r'dadjokes_partial_data.csv')
df = data.copy()

# Preview the first 20 rows
df.head(20)

In [None]:
# Column names, dtypes, non-null counts and memory usage of the working copy
df.info()

In [None]:
# Number of distinct values per column (missing values are not counted by default)
df.nunique()

In [None]:
# Missing-value count for each column
df.isna().sum()

# Jokes as a construct: Title + Selftext

## Focus: Title

In [None]:
# Collect every row whose title occurs more than once; keep=False flags all
# copies of a repeated title rather than only the later ones.
repeated_title_mask = df.duplicated(subset=["title"], keep=False)
duplicates_title = df[repeated_title_mask]
print("Number of duplicate titles: ", duplicates_title.shape[0])
print(duplicates_title)

In [None]:
# Deduplicate on title, retaining the first row seen for each distinct title
# (keep="first" is the pandas default)
df_unique = df.drop_duplicates(subset="title")
print(df_unique)

In [None]:
# Drop rows whose title is missing (NaN); only the title column is checked,
# so rows with a missing selftext are kept.
df_unique = df_unique.dropna(subset=['title'])
print(df_unique)

In [None]:
# Take an independent copy: plain assignment would only alias df_unique, so the
# column added to df_clean further down would also appear on df_unique and
# could raise pandas' SettingWithCopyWarning.
df_clean = df_unique.copy()

## Focus: Selftext

A dad joke is built either from the title plus the selftext, or from the title alone.

In [None]:
# Rows whose selftext is shared with at least one other row; keep=False flags
# every member of a duplicate group.
# NOTE: pandas treats missing values as equal to each other here, so rows
# without a selftext are counted as duplicates of one another.
repeated_selftext_mask = df_clean.duplicated(subset=["selftext"], keep=False)
duplicates_selftext = df_clean[repeated_selftext_mask]
print("Number of duplicate selftext: ", len(duplicates_selftext))
duplicates_selftext.head(20)

## Focus: Joke Recipe

In [None]:
# Build the joke text as "<title> <selftext>". A missing part becomes '' and
# strip() removes the dangling space left behind for title-only jokes.
title_text = df_clean['title'].fillna('')
body_text = df_clean['selftext'].fillna('')
df_clean['dadjoke'] = (title_text + " " + body_text).str.strip()

# New frame carrying the merged dadjoke column in place of title/selftext;
# every other column of df_clean is kept as is.
df_merged = df_clean.drop(columns=['title', 'selftext'])
df_merged


In [None]:
# Sanity check: list any rows whose merged joke text still occurs more than once
repeated_joke_mask = df_merged.duplicated(subset=["dadjoke"], keep=False)
duplicates_dadjoke = df_merged[repeated_joke_mask]
print(duplicates_dadjoke)

In [None]:
# Flag each joke as True when its text contains an http(s) URL, else False
# (missing text counts as False).
df_merged['contains_link'] = df_merged['dadjoke'].str.contains(r'https?://\S+', regex=True, na=False)

# Per-column non-null counts over the flagged rows, i.e. how many jokes carry a link
rows_with_link = df_merged[df_merged['contains_link']]
print(rows_with_link.count())
df_merged.head(5)


In [None]:
# Keep only the jokes without links. The explicit copy makes df_merged an
# independent frame, so adding further columns to it afterwards does not
# trigger pandas' SettingWithCopyWarning.
df_merged = df_merged.loc[~df_merged['contains_link']].copy()
print(df_merged)


In [None]:
# Terms that disqualify a joke: profanity plus 'removed'/'deleted'
# (presumably Reddit's [removed]/[deleted] placeholders — confirm against the data).
inappropriate_words = ['removed', 'deleted', 'sex', 'dick', 'penis', 'fuck', 'ass', 'vagina', 'butt']

# Whole-word alternation over the list. Each term goes through re.escape so a
# future entry containing regex metacharacters is matched literally instead of
# altering the pattern. Case-insensitivity is supplied by the flags at the call site.
pattern = r'\b(?:' + '|'.join(re.escape(word) for word in inappropriate_words) + r')\b'

# Flag jokes whose text matches the word pattern, ignoring case; missing text
# is treated as not matching.
df_merged['contains_inappropriate'] = df_merged['dadjoke'].str.contains(pattern, flags=re.IGNORECASE, na=False)

# Report how many jokes were flagged, then show them
flagged_jokes = df_merged[df_merged['contains_inappropriate']]
print(len(flagged_jokes))
print(flagged_jokes)


In [None]:
# Keep only the jokes that were not flagged, i.e. drop those containing
# inappropriate words or the removed/deleted markers
not_flagged = ~df_merged['contains_inappropriate']
df_merged = df_merged[not_flagged]
print(df_merged)

In [None]:
# Final dataset: just the joke text. The result must be bound to `df_clean`
# (the previous `dF_clean` was a typo) because the save step below writes
# `df_clean`; with the typo, the earlier unfiltered frame was exported instead.
df_clean = df_merged[['dadjoke']]

In [None]:
# Save the DataFrame as csv
# Writes the frame currently bound to `df_clean` to a file in the working directory.
# NOTE(review): to_csv also writes the row index as an unnamed first column by
# default — confirm the consumer expects it, otherwise pass index=False.
output_path = "cleaned_dadjokes.csv"
df_clean.to_csv(output_path)
