# Exploratory data analysis 

Let's dive into harvested data from dadjokes subreddit to understand it better and prepare for RAG

In [1]:
# Libraries
import pandas as pd
import numpy as np
import re
import os

In [2]:
data = pd.read_csv(r'data/dadjokes_partial_data.csv')
df = data.copy()

df.head(20)

Unnamed: 0,id,upvotes,title,selftext,date
0,ep8ed0,86878,BREAKING: Iran has struck its own submarine wi...,Whoops wrong sub,2020-01-15 22:02:55
1,v2d37y,56956,My offspring came out as transgender last night,"As far as I’m concerned, I have no son\n\nEdit...",2022-06-01 11:38:36
2,ouy3jh,51639,"If you can't say it to your 5 yr old kid, its ...",I'm getting really sick of all the NSFW jokes ...,2021-07-31 03:22:17
3,9zyjbd,48977,"Today, my son asked ""Can I have a book mark?"" ...",,2018-11-24 14:43:21
4,uzkegu,48006,"An Afghan, an Albanian, an Algerian, an Americ...","a Bolivian, a Bosnian, a Brazilian, a Brit, a...",2022-05-28 12:15:12
5,sq308p,46102,Not a joke. But I wish it was.,I am not a dad. I am a daughter. For longer th...,2022-02-11 17:13:09
6,fy3oqw,43971,My wife just completed a 40 week body building...,It's a girl and weighs 7lbs 12 oz.,2020-04-10 02:02:23
7,zglalj,38822,"This just happened in real life, and I got not...",True story: the wife and I were walking in Tar...,2022-12-09 04:17:07
8,iblapp,38556,[warning 18+],19,2020-08-17 21:43:14
9,76idqz,38364,My wife just gave birth today and after thanki...,"He winked at me and said, ""I'm off duty in ten...",2017-10-15 13:06:16


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 996 entries, 0 to 995
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        996 non-null    object
 1   upvotes   996 non-null    int64 
 2   title     996 non-null    object
 3   selftext  950 non-null    object
 4   date      996 non-null    object
dtypes: int64(1), object(4)
memory usage: 39.0+ KB


In [4]:
df.nunique()

id          996
upvotes     951
title       968
selftext    927
date        996
dtype: int64

In [5]:
df.isnull().sum().T

id           0
upvotes      0
title        0
selftext    46
date         0
dtype: int64

# Jokes as a construct: Title + Selftext

## Focus: Title

In [6]:
# find duplicate titles and their number
duplicates_title = df[df.duplicated(subset=["title"], keep=False)]
print("Number of duplicate titles: ", len(duplicates_title))
print(duplicates_title)

Number of duplicate titles:  51
         id  upvotes                                              title  \
10   7ndcz7    38323                          Is this sub still active?   
13   ixxf1m    35030  The day I turned 42, my daughter walked up to ...   
34   veopze    26969                   WHO HAS 2 THUMBS AND IS AWESOME?   
40   5ldnsh    26004                          Is this sub still active?   
67   6y72pf    22032  My wife accused me of hating her family and re...   
79   b0kq83    20782  My therapist says that I second guess every de...   
86   knpraa    20417                          Is this sub still active?   
146  d0ai38    17814  For the 10th year in a row, my coworkers voted...   
153  r1orsj    17673                   WHO HAS 2 THUMBS AND IS AWESOME?   
159  4tgfgk    17460                My girlfriend is turning 32 soon...   
160  ki391k    17457  The genie asked, "What’s your first wish?" Ste...   
208  efwtxi    16495    What is the least spoken language in the wor

In [7]:
# remove duplicate titles
df_unique = df.drop_duplicates(subset=["title"], keep="first")
print(df_unique)

          id  upvotes                                              title  \
0     ep8ed0    86878  BREAKING: Iran has struck its own submarine wi...   
1     v2d37y    56956    My offspring came out as transgender last night   
2     ouy3jh    51639  If you can't say it to your 5 yr old kid, its ...   
3     9zyjbd    48977  Today, my son asked "Can I have a book mark?" ...   
4     uzkegu    48006  An Afghan, an Albanian, an Algerian, an Americ...   
..       ...      ...                                                ...   
991  1f8btzc     8107  I was gutted this afternoon when my wife told ...   
992  1imgl58     7670  My son said to me "Dad, today I watched someon...   
993  1j47w6r     7369  The nurse at the sperm bank asked me if I'd li...   
994  1i2qxn8     7134    My wife calls me her sex machine all the time.    
995  1hh11ct     7128  My wife texted me this morning and said, “Your...   

                                              selftext                 date  
0        

In [8]:
# drop nan titles
df_unique = df_unique.dropna(subset=['title'])
print(df_unique)

          id  upvotes                                              title  \
0     ep8ed0    86878  BREAKING: Iran has struck its own submarine wi...   
1     v2d37y    56956    My offspring came out as transgender last night   
2     ouy3jh    51639  If you can't say it to your 5 yr old kid, its ...   
3     9zyjbd    48977  Today, my son asked "Can I have a book mark?" ...   
4     uzkegu    48006  An Afghan, an Albanian, an Algerian, an Americ...   
..       ...      ...                                                ...   
991  1f8btzc     8107  I was gutted this afternoon when my wife told ...   
992  1imgl58     7670  My son said to me "Dad, today I watched someon...   
993  1j47w6r     7369  The nurse at the sperm bank asked me if I'd li...   
994  1i2qxn8     7134    My wife calls me her sex machine all the time.    
995  1hh11ct     7128  My wife texted me this morning and said, “Your...   

                                              selftext                 date  
0        

In [9]:
df_clean = df_unique

## Focus: Selftext

Construct of the dadjoke is (title + selftext) or only title

In [10]:
# find duplicate selftexts and their number
duplicates_selftext = df_clean[df_clean.duplicated(subset=["selftext"], keep=False)]
print("Number of duplicate selftext: ", len(duplicates_selftext))
duplicates_selftext.head(20)

Number of duplicate selftext:  80


Unnamed: 0,id,upvotes,title,selftext,date
0,ep8ed0,86878,BREAKING: Iran has struck its own submarine wi...,Whoops wrong sub,2020-01-15 22:02:55
3,9zyjbd,48977,"Today, my son asked ""Can I have a book mark?"" ...",,2018-11-24 14:43:21
8,iblapp,38556,[warning 18+],19,2020-08-17 21:43:14
17,7rspss,32681,Gonorrhea would have been a great name for dia...,,2018-01-20 21:03:54
19,9ark5f,32089,If pronouncing my b's as v's makes me sound Ru...,,2018-08-27 20:38:48
20,l1czgc,32047,In honor of former president Donald J. Trump,[removed],2021-01-20 17:49:52
21,tskov3,31712,When you thought you’ve just about heard all t...,1,2022-03-31 00:11:44
22,bb5fkc,31065,I was reading the history of the French Revolu...,[removed],2019-04-09 10:29:18
24,dcbxwz,30890,For those of us always looking...,,2019-10-02 17:40:53
28,b8cgvx,28928,"Of all the inventions of the last 100 years, t...",,2019-04-02 03:18:08


## Focus: Joke Recipe

In [11]:
# concatenation with fillna and stripping extra whitespace
df_clean['dadjoke'] = (df_clean['title'].fillna('') + " " + df_clean['selftext'].fillna('')).str.strip()
df_clean

# create new dataframe only with merged column of dadjokes
df_merged = df_clean.drop(columns=['title', 'selftext'])
df_merged


Unnamed: 0,id,upvotes,date,dadjoke
0,ep8ed0,86878,2020-01-15 22:02:55,BREAKING: Iran has struck its own submarine wi...
1,v2d37y,56956,2022-06-01 11:38:36,My offspring came out as transgender last nigh...
2,ouy3jh,51639,2021-07-31 03:22:17,"If you can't say it to your 5 yr old kid, its ..."
3,9zyjbd,48977,2018-11-24 14:43:21,"Today, my son asked ""Can I have a book mark?"" ..."
4,uzkegu,48006,2022-05-28 12:15:12,"An Afghan, an Albanian, an Algerian, an Americ..."
...,...,...,...,...
991,1f8btzc,8107,2024-09-03 23:26:11,I was gutted this afternoon when my wife told ...
992,1imgl58,7670,2025-02-10 21:45:37,"My son said to me ""Dad, today I watched someon..."
993,1j47w6r,7369,2025-03-05 18:03:20,The nurse at the sperm bank asked me if I'd li...
994,1i2qxn8,7134,2025-01-16 16:06:39,My wife calls me her sex machine all the time....


In [12]:
# double check for duplicates
duplicates_dadjoke = df_merged[df_merged.duplicated(subset=["dadjoke"], keep=False)]
print(duplicates_dadjoke)

Empty DataFrame
Columns: [id, upvotes, date, dadjoke]
Index: []


In [13]:
# create a new column that is True if a URL is found in the summary, else False.
df_merged['contains_link'] = df_merged['dadjoke'].apply(lambda x: bool(re.search(r'https?://\S+', x)) if pd.notnull(x) else False)

# Print rows that contain links
print(df_merged[df_merged['contains_link']].count())
df_merged.head(5)


id               20
upvotes          20
date             20
dadjoke          20
contains_link    20
dtype: int64


Unnamed: 0,id,upvotes,date,dadjoke,contains_link
0,ep8ed0,86878,2020-01-15 22:02:55,BREAKING: Iran has struck its own submarine wi...,False
1,v2d37y,56956,2022-06-01 11:38:36,My offspring came out as transgender last nigh...,False
2,ouy3jh,51639,2021-07-31 03:22:17,"If you can't say it to your 5 yr old kid, its ...",False
3,9zyjbd,48977,2018-11-24 14:43:21,"Today, my son asked ""Can I have a book mark?"" ...",False
4,uzkegu,48006,2022-05-28 12:15:12,"An Afghan, an Albanian, an Algerian, an Americ...",False


In [14]:
# drop all dadjokes containing links
df_merged = df_merged[df_merged['contains_link'] == False]
print(df_merged)


          id  upvotes                 date  \
0     ep8ed0    86878  2020-01-15 22:02:55   
1     v2d37y    56956  2022-06-01 11:38:36   
2     ouy3jh    51639  2021-07-31 03:22:17   
3     9zyjbd    48977  2018-11-24 14:43:21   
4     uzkegu    48006  2022-05-28 12:15:12   
..       ...      ...                  ...   
991  1f8btzc     8107  2024-09-03 23:26:11   
992  1imgl58     7670  2025-02-10 21:45:37   
993  1j47w6r     7369  2025-03-05 18:03:20   
994  1i2qxn8     7134  2025-01-16 16:06:39   
995  1hh11ct     7128  2024-12-18 13:59:23   

                                               dadjoke  contains_link  
0    BREAKING: Iran has struck its own submarine wi...          False  
1    My offspring came out as transgender last nigh...          False  
2    If you can't say it to your 5 yr old kid, its ...          False  
3    Today, my son asked "Can I have a book mark?" ...          False  
4    An Afghan, an Albanian, an Algerian, an Americ...          False  
..             

In [15]:
inappropriate_words = ['removed', 'deleted', 'sex', 'dick', 'penis', 'fuck', 'ass', 'vagina', 'butt']

# regex pattern that matches any of these words as whole words (case-insensitive)
pattern = r'\b(?:' + '|'.join(inappropriate_words) + r')\b'

# a new column that is True if the summary contains any inappropriate words
df_merged['contains_inappropriate'] = df_merged['dadjoke'].str.contains(pattern, flags=re.IGNORECASE, na=False)

# Print the rows where inappropriate words were found
print(len(df_merged[df_merged['contains_inappropriate']]))
print(df_merged[df_merged['contains_inappropriate']])


42
          id  upvotes                 date  \
9     76idqz    38364  2017-10-15 13:06:16   
20    l1czgc    32047  2021-01-20 17:49:52   
22    bb5fkc    31065  2019-04-09 10:29:18   
30    9q8w4k    28095  2018-10-22 02:54:12   
39    iiewjg    26122  2020-08-28 22:31:48   
71    jex4tc    21766  2020-10-20 21:49:08   
79    b0kq83    20782  2019-03-13 12:09:23   
101   soksuk    19645  2022-02-09 19:48:21   
110   cgc36n    19295  2019-07-22 13:36:49   
111   zfc0gw    19243  2022-12-07 21:04:00   
115   lec5dd    19043  2021-02-07 02:31:00   
141   hl7khp    17986  2020-07-04 19:56:38   
176   8bgeb5    17049  2018-04-11 13:42:33   
180   bge21b    16991  2019-04-23 10:48:11   
212   k1ea2r    16390  2020-11-26 13:17:32   
214   kiwxmi    16306  2020-12-23 18:11:41   
218   jkyl7z    16272  2020-10-30 16:13:26   
247   hb5ya1    15823  2020-06-18 04:35:00   
291   omg7b1    14913  2021-07-18 02:36:56   
292   iuyy04    14883  2020-09-18 05:54:42   
385   epvwwx    13847  2020-01-

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_merged['contains_inappropriate'] = df_merged['dadjoke'].str.contains(pattern, flags=re.IGNORECASE, na=False)


In [16]:
# drop all dadjokes containing inappropriate words and removed, deleted
df_merged = df_merged[df_merged['contains_inappropriate'] == False]
print(df_merged)

          id  upvotes                 date  \
0     ep8ed0    86878  2020-01-15 22:02:55   
1     v2d37y    56956  2022-06-01 11:38:36   
2     ouy3jh    51639  2021-07-31 03:22:17   
3     9zyjbd    48977  2018-11-24 14:43:21   
4     uzkegu    48006  2022-05-28 12:15:12   
..       ...      ...                  ...   
990  1iis8eq     8679  2025-02-06 03:40:14   
991  1f8btzc     8107  2024-09-03 23:26:11   
992  1imgl58     7670  2025-02-10 21:45:37   
993  1j47w6r     7369  2025-03-05 18:03:20   
995  1hh11ct     7128  2024-12-18 13:59:23   

                                               dadjoke  contains_link  \
0    BREAKING: Iran has struck its own submarine wi...          False   
1    My offspring came out as transgender last nigh...          False   
2    If you can't say it to your 5 yr old kid, its ...          False   
3    Today, my son asked "Can I have a book mark?" ...          False   
4    An Afghan, an Albanian, an Algerian, an Americ...          False   
..       

In [17]:
df_clean = df_merged[['upvotes','dadjoke']]

In [18]:
# Save the DataFrame as csv
output_path = os.path.join("data", "cleaned_dadjokes.csv")
df_clean.to_csv(output_path, index=False)
