# Download Reddit TIFU and Confession Datasets

Downloads and filters Reddit stories from:
- TIFU (from Kaggle)
- Confessions (from HuggingFace)

Saves the combined dataset to `data/raw/reddit_stories.csv`; the filtered TIFU and Confession sets are also written to their own CSV files along the way.

In [None]:
# Imports
from datasets import load_dataset
import pandas as pd
from pathlib import Path
import sys
from tqdm import tqdm
import zipfile
import os
from kaggle.api.kaggle_api_extended import KaggleApi
import shutil

# Add parent directory to path
sys.path.append(str(Path.cwd().parent))
from config import RAW_DATA_DIR, PROCESSED_DATA_DIR


Data will be saved to: /Users/averylee/Desktop/Fiction Unlimited/data/raw


In [2]:
# Terms that disqualify a post from the dataset.
# The filters in this notebook test each entry with a case-insensitive
# substring check (`word in text.lower()`), so short entries such as 'trans'
# or 'rape' also match inside longer words.
BANNED_WORDS = [
    'suicide',
    'kidnap', 'kidnapped', 'kidnapping',
    'cheating',
    'minors',
    'regret', 'regretting',
    'aborted', 'abortion',
    'racism', 'racist',
    'rape', 'raping', 'raped', 'rapist',
    'molested',
    'debt',
    'covid', 'covid-19', 'covid 19',
    'trump',
    'therapy', 'medication',
    'trans', 'transphobic',
    'ugly',
    'stereotype', 'stereotyping',
]

## Part 1: TIFU Dataset (from Kaggle)

In [3]:
# Download the r/tifu dataset from Kaggle into a scratch directory, load the
# first CSV/JSON file found into `tifu_raw_df`, and remove the scratch
# directory again (also when one of the steps fails).
print("Downloading r/tifu dataset from Kaggle...")

# Initialize and authenticate the Kaggle API client
api = KaggleApi()
api.authenticate()

# Scratch directory for the raw download
temp_dir = RAW_DATA_DIR / "temp_tifu"
temp_dir.mkdir(exist_ok=True, parents=True)

try:
    api.dataset_download_files('sanme1/reddit-tifu', path=str(temp_dir), unzip=True)

    # Check what files were downloaded
    all_files = list(temp_dir.glob('*'))
    print(f"Downloaded files: {[f.name for f in all_files]}")

    # Fallback: extract any archives that are still zipped after the download
    zip_files = list(temp_dir.glob('*.zip'))
    if zip_files:
        print(f"Unzipping {len(zip_files)} zip file(s)...")
        for zip_file in zip_files:
            with zipfile.ZipFile(zip_file, 'r') as zip_ref:
                zip_ref.extractall(temp_dir)
            zip_file.unlink()  # Delete the zip file after extraction

    # Prefer CSV files; only look for JSON when no CSV is present
    tifu_files = list(temp_dir.glob('*.csv')) or list(temp_dir.glob('*.json'))
    print(f"Data files found: {[f.name for f in tifu_files]}")

    if not tifu_files:
        raise FileNotFoundError(f"No data files found in {temp_dir}")

    # Load the first matching file (CSV or JSON)
    data_file = tifu_files[0]
    if data_file.suffix == '.csv':
        tifu_raw_df = pd.read_csv(data_file)
    else:
        # Try JSONL format (lines=True) first, then regular JSON.
        # Only a parse failure (ValueError) triggers the fallback; anything
        # else (e.g. KeyboardInterrupt, MemoryError) is allowed to propagate.
        try:
            tifu_raw_df = pd.read_json(data_file, lines=True)
            print("Loaded as JSONL format")
        except ValueError:
            tifu_raw_df = pd.read_json(data_file)
            print("Loaded as JSON format")

    print(f"Loaded TIFU dataset with {len(tifu_raw_df)} rows")
    print(f"Columns: {list(tifu_raw_df.columns)}")
    print("\nFirst few rows:")
    print(tifu_raw_df.head())
finally:
    # Clean up temp directory, even if the download or parsing failed
    shutil.rmtree(temp_dir)
    print("\nCleaned up temp files")

Downloading r/tifu dataset from Kaggle...
Dataset URL: https://www.kaggle.com/datasets/sanme1/reddit-tifu
Downloaded files: ['tifu_all_tokenized_and_filtered.json']
Data files found: ['tifu_all_tokenized_and_filtered.json']
Loaded as JSONL format
Loaded TIFU dataset with 79949 rows
Columns: ['title_tokenized', 'permalink', 'title', 'url', 'num_comments', 'tldr', 'created_utc', 'trimmed_title_tokenized', 'id', 'selftext_html', 'score', 'upvote_ratio', 'selftext', 'trimmed_title', 'selftext_without_tldr_tokenized', 'ups', 'selftext_without_tldr', 'tldr_tokenized']

First few rows:
                                     title_tokenized  \
0  [tifu, by, forgetting, to, pull, my, underwear...   
1                   [tifu, by, gender, stereotyping]   
2                      [tifu, by, drinking, a, beer]   
3   [tifu, by, telling, my, dad, that, i, love, him]   
4  [tifu, :, how, hard, can, you, fail, at, somet...   

                                           permalink  \
0  /r/tifu/comments/1

In [8]:
# Process and filter TIFU dataset
print("Processing TIFU dataset...")

def contains_banned_words(text, banned_words=None):
    """Check if text contains any banned words (case insensitive).

    Matching is plain substring containment, not whole-word matching, so a
    banned word also matches when it appears inside a longer word.

    Args:
        text: Value to inspect. Falsy values and missing values (None/NaN)
            are treated as containing no banned words; anything else is
            converted with ``str()`` before matching.
        banned_words: Optional iterable of words to look for. Defaults to the
            notebook-level ``BANNED_WORDS`` list.

    Returns:
        bool: True if at least one banned word occurs in ``text``.
    """
    if banned_words is None:
        banned_words = BANNED_WORDS
    if not text or pd.isna(text):
        return False
    text_lower = str(text).lower()
    # Lower-case the words too so caller-supplied lists stay case insensitive
    return any(str(banned_word).lower() in text_lower for banned_word in banned_words)

# Walk over every raw TIFU post, drop the ones failing any filter, and collect
# the survivors as plain dicts that become `tifu_df`.
tifu_stories = []
filtered_count = 0
total_count = len(tifu_raw_df)
kept_count = 0

for _, row in tqdm(tifu_raw_df.iterrows(), total=total_count, desc="Processing TIFU"):
    # Pull the fields used below from the loaded frame
    title = row.get('title', '')
    url = row.get('url', '')
    tldr = row.get('tldr', '')
    story_text = row.get('selftext', '')
    score = row.get('score', 0)
    trimmed_title = row.get('trimmed_title', '')

    # Filter 1: require a TL;DR of at least 5 characters
    if pd.isna(tldr) or len(str(tldr)) < 5:
        filtered_count += 1
        continue
    # Filter 2: require a score of at least 100 (a missing score is rejected
    # instead of crashing the int() conversion further down)
    if pd.isna(score) or score < 100:
        filtered_count += 1
        continue
    # Filter 3: require a non-empty post body
    if not story_text or pd.isna(story_text):
        filtered_count += 1
        continue
    # Filter 4: reject posts with a banned word in body, title or TL;DR
    if contains_banned_words(story_text) or contains_banned_words(title) or contains_banned_words(tldr):
        filtered_count += 1
        continue

    # Keep story. The TL;DR (not the full selftext) is stored as the text,
    # and ids are numbered consecutively over the kept posts only.
    story = {
        "id": f"tifu_{kept_count}",
        "title": str(trimmed_title) if not pd.isna(trimmed_title) else '',
        "text": str(tldr),
        "url": str(url) if not pd.isna(url) else '',
        "score": int(score),
        "source": "reddit_tifu"
    }

    tifu_stories.append(story)
    kept_count += 1

# Avoid a ZeroDivisionError when the raw frame is empty
filtered_pct = filtered_count / total_count * 100 if total_count else 0.0
print(f"\nFiltered out {filtered_count}/{total_count} posts ({filtered_pct:.1f}%)")
print(f"Kept {len(tifu_stories)} posts")

tifu_df = pd.DataFrame(tifu_stories)
print(f"TIFU dataframe shape: {tifu_df.shape}")

Processing TIFU dataset...


Processing TIFU: 100%|██████████| 79949/79949 [00:24<00:00, 3310.66it/s] 


Filtered out 73917/79949 posts (92.5%)
Kept 6032 posts
TIFU dataframe shape: (6032, 6)





In [9]:
# Show full cell contents instead of truncating long text columns, then
# preview the first rows of the filtered TIFU frame.
pd.options.display.max_colwidth = None
tifu_df.head()

Unnamed: 0,id,title,text,url,score,source
0,tifu_0,"girl gave me head, i thought i came...i was wrong![slightly nsfw?]",lost my virginity on someones driveway with a girl i didn't give a shit about and peed in her mouth.,https://www.reddit.com/r/tifu/comments/1c5wiq/tifu_girl_gave_me_head_i_thought_i_camei_was/,911,reddit_tifu
1,tifu_1,locking myself out and thinking i just had gas,"locked myself out of my car and condo, sharted, had to walk in the rain 2 miles to the bus.",https://www.reddit.com/r/tifu/comments/1c52r5/tifu_by_locking_myself_out_and_thinking_i_just/,127,reddit_tifu
2,tifu_2,sharting on my boyfriend,i sharted on my boyfriend during sex.,https://www.reddit.com/r/tifu/comments/1allb6/tifu_by_sharting_on_my_boyfriend/,124,reddit_tifu
3,tifu_3,during sexy times,tried to make babies in my girlfriend's eye.,https://www.reddit.com/r/tifu/comments/1ascpd/tifu_during_sexy_times/,120,reddit_tifu
4,tifu_4,trying to help an elderly man use ie on windows 8,"tried helping someone with a problem even though it was out of my realm of support and ended up looking at multiple pages of an elderly man's porn, then had to close it out for him because he didn't know how. tifu.",https://www.reddit.com/r/tifu/comments/1ja60j/tifu_by_trying_to_help_an_elderly_man_use_ie_on/,579,reddit_tifu


In [None]:
# Write the filtered TIFU stories to CSV (index column omitted) and preview them
tifu_csv_path = PROCESSED_DATA_DIR / "reddit_tifu_processed.csv"
tifu_df.to_csv(tifu_csv_path, index=False)
print(f"Saved TIFU data to: {tifu_csv_path}")
print("Preview of TIFU data:")
tifu_df.head()

Saved TIFU data to: /Users/averylee/Desktop/Fiction Unlimited/data/raw/reddit_tifu_processed.csv
Preview of TIFU data:


Unnamed: 0,id,title,text,url,score,source
0,tifu_0,"girl gave me head, i thought i came...i was wrong![slightly nsfw?]",lost my virginity on someones driveway with a girl i didn't give a shit about and peed in her mouth.,https://www.reddit.com/r/tifu/comments/1c5wiq/tifu_girl_gave_me_head_i_thought_i_camei_was/,911,reddit_tifu
1,tifu_1,locking myself out and thinking i just had gas,"locked myself out of my car and condo, sharted, had to walk in the rain 2 miles to the bus.",https://www.reddit.com/r/tifu/comments/1c52r5/tifu_by_locking_myself_out_and_thinking_i_just/,127,reddit_tifu
2,tifu_2,sharting on my boyfriend,i sharted on my boyfriend during sex.,https://www.reddit.com/r/tifu/comments/1allb6/tifu_by_sharting_on_my_boyfriend/,124,reddit_tifu
3,tifu_3,during sexy times,tried to make babies in my girlfriend's eye.,https://www.reddit.com/r/tifu/comments/1ascpd/tifu_during_sexy_times/,120,reddit_tifu
4,tifu_4,trying to help an elderly man use ie on windows 8,"tried helping someone with a problem even though it was out of my realm of support and ended up looking at multiple pages of an elderly man's porn, then had to close it out for him because he didn't know how. tifu.",https://www.reddit.com/r/tifu/comments/1ja60j/tifu_by_trying_to_help_an_elderly_man_use_ie_on/,579,reddit_tifu


## Part 2: Confessions Dataset (from HuggingFace)

In [3]:
# Open the r/confession dataset from HuggingFace with streaming=True, so
# records are consumed while iterating in the next cell.
print("Downloading r/confession dataset from HuggingFace (streaming mode)...")
confession_dataset = load_dataset(
    path="SocialGrep/one-million-reddit-confessions",
    streaming=True,
)
print("Dataset loaded in streaming mode - will filter as we iterate")

Downloading r/confession dataset from HuggingFace (streaming mode)...
Dataset loaded in streaming mode - will filter as we iterate


In [None]:
# Process and filter Confessions dataset
print("Processing confession dataset...")

# Only posts whose `domain` field is one of these values are kept
ALLOWED_DOMAINS = ['self.confession', 'self.confessions']

def contains_banned_words(text, banned_words=None):
    """Check if text contains any banned words (case insensitive).

    Behaves like the definition in the TIFU cell so both parts of the notebook
    filter identically: matching is plain substring containment, and values
    that are not strings are converted with ``str()`` instead of raising.

    Args:
        text: Value to inspect. Falsy values and missing values (None/NaN)
            are treated as containing no banned words.
        banned_words: Optional iterable of words to look for. Defaults to the
            notebook-level ``BANNED_WORDS`` list.

    Returns:
        bool: True if at least one banned word occurs in ``text``.
    """
    if banned_words is None:
        banned_words = BANNED_WORDS
    if not text or pd.isna(text):
        return False
    text_lower = str(text).lower()
    # Lower-case the words too so caller-supplied lists stay case insensitive
    return any(str(banned_word).lower() in text_lower for banned_word in banned_words)

# Stream through the confession posts, drop the ones failing any filter, and
# collect the survivors as plain dicts that become `confession_df`.
confession_stories = []
filtered_count = 0
total_count = 0
kept_count = 0

# Iterate through streaming dataset
for item in confession_dataset['train']:
    total_count += 1

    # Progress update every 10,000 processed items ('\r' rewrites the same line)
    if total_count % 10000 == 0:
        print(f"Processed {total_count} items, kept {kept_count} stories...", end='\r')

    # Filter 1: Check domain
    domain = item.get('domain', '')
    if domain not in ALLOWED_DOMAINS:
        filtered_count += 1
        continue

    # Filter 2: Check score (must be between 100 and 1000 inclusive);
    # a missing score is rejected instead of raising on the comparison
    score = item.get('score', 0)
    if score is None or score < 100 or score > 1000:
        filtered_count += 1
        continue

    # Get text fields
    story_text = item.get('selftext', '')
    title = item.get('title', '')

    # Filter 3: exclude posts that have neither a body nor a title.
    # NOTE(review): bodies consisting of '[removed]' / '[deleted]' are not
    # empty and therefore pass this filter — confirm that is intended.
    if not story_text and not title:
        filtered_count += 1
        continue

    # Filter 4: Check for banned words
    if contains_banned_words(title) or contains_banned_words(story_text):
        filtered_count += 1
        continue

    # Keep story; ids are numbered consecutively over the kept posts only
    story = {
        "id": f"confessions_{kept_count}",
        "title": title,
        "text": story_text,
        "url": item.get('permalink', ''),
        "score": score,
        "source": "reddit_confession"
    }

    confession_stories.append(story)
    kept_count += 1

# Avoid a ZeroDivisionError when the stream yielded nothing
filtered_pct = filtered_count / total_count * 100 if total_count else 0.0
print(f"\n\nFiltered out {filtered_count}/{total_count} posts ({filtered_pct:.1f}%)")
print(f"Kept {len(confession_stories)} posts")

confession_df = pd.DataFrame(confession_stories)
print(f"Confession dataframe shape: {confession_df.shape}")

Processing confession dataset...
Processed 1000000 items, kept 3360 stories...

Filtered out 996640/1000000 posts (99.7%)
Kept 3360 posts
Confession dataframe shape: (3360, 6)


In [None]:
# Convert confession stories to third person
# NOTE(review): `convert_to_third_person` is neither defined nor imported in
# the visible part of this notebook; it has to be in scope before this cell
# can run — confirm where it is supposed to come from.
print("Converting Confession stories to third person...")
confession_df['text_third_person'] = confession_df['text'].apply(convert_to_third_person)

print(f"✅ Converted {len(confession_df)} stories to third person")

# Show a single before/after pair. Scan the whole frame for the first row with
# a real body: the leading rows are often '[removed]' / '[deleted]'
# placeholders, so looking at only the first few rows could print nothing.
placeholder_texts = ['[removed]', '[deleted]']
print("\nComparison:")
for idx in confession_df.index:
    original_text = confession_df.loc[idx, 'text']
    if pd.notna(original_text) and original_text not in placeholder_texts:
        print("\nOriginal (1st person):")
        print(original_text)
        print("\nConverted (3rd person):")
        print(confession_df.loc[idx, 'text_third_person'])
        break

In [11]:
# Show full cell contents instead of truncating long text columns
pd.options.display.max_colwidth = None
# Scores that were too low were already removed by the filter loop above.
# Posts scoring above roughly 400 seemed too serious and depressing, so only
# rows with a score of at most 400 are kept.
within_score_cap = confession_df['score'] <= 400
confession_df = confession_df[within_score_cap]

In [12]:
# Write the filtered confession stories to CSV (index column omitted) and preview them
confession_csv_path = RAW_DATA_DIR / "reddit_confession.csv"
confession_df.to_csv(confession_csv_path, index=False)
print(f"Saved Confession data to: {confession_csv_path}")
print("Preview of Confession data:")
confession_df.head()

Saved Confession data to: /Users/averylee/Desktop/Fiction Unlimited/data/raw/reddit_confession.csv
Preview of Confession data:


Unnamed: 0,id,title,text,score,url,source
1,confessions_1,I've been lying to my mom to keep from breaking her heart,[removed],151,https://old.reddit.com/r/confession/comments/py8jzs/ive_been_lying_to_my_mom_to_keep_from_breaking/,reddit_confession
3,confessions_3,"My friend posted annoying pics, so I made up a lie to stop it",[deleted],323,https://old.reddit.com/r/confession/comments/py1b5y/my_friend_posted_annoying_pics_so_i_made_up_a_lie/,reddit_confession
4,confessions_4,i reported my 7th grade math teacher for yelling at a student with the soul purpose of getting him in trouble. and he did.,[removed],132,https://old.reddit.com/r/confession/comments/pxmv20/i_reported_my_7th_grade_math_teacher_for_yelling/,reddit_confession
8,confessions_8,"I need to sleep more often than twice a week, hallucinations are not fun while working security",[removed],245,https://old.reddit.com/r/confession/comments/pw5wk8/i_need_to_sleep_more_often_than_twice_a_week/,reddit_confession
9,confessions_9,I stole from a blind guy in front of Burger King...,When I was 15 me and friend were walking around downtown and she seen this blind guy outside of burger king with $10 in his hand she said should I take I said ya whatever and she grabbed it and we ran and kind of started laughing about it. And now that I’m 21 I think about how shitty that was idiotic I wish I could take it back or stopped her and not have laughed...,310,https://old.reddit.com/r/confession/comments/pvml3s/i_stole_from_a_blind_guy_in_front_of_burger_king/,reddit_confession


In [13]:
# Inspect the (rows, columns) shape of the confession frame
confession_df.shape

(2455, 6)

## Part 3: Combine Both Datasets

In [30]:
# Stack the TIFU and confession frames into one frame with a fresh 0..n-1 index
combined_df = pd.concat([tifu_df, confession_df], ignore_index=True)

# Tally stories per source via the id prefix assigned in the filter loops
story_ids = combined_df['id']
tifu_count = story_ids.str.startswith('tifu_').sum()
confession_count = story_ids.str.startswith('confessions_').sum()

print("\n=== Dataset Summary ===")
print(f"Total stories: {len(combined_df)}")
print(f"TIFU stories: {tifu_count}")
print(f"Confession stories: {confession_count}")
print(f"Combined dataframe shape: {combined_df.shape}")


=== Dataset Summary ===
Total stories: 8784
TIFU stories: 6173
Confession stories: 2611
Combined dataframe shape: (8784, 9)


In [None]:
# Write the combined stories to CSV (index column omitted) and preview them
combined_csv_path = RAW_DATA_DIR / "reddit_stories.csv"
combined_df.to_csv(combined_csv_path, index=False)
print(f"\n✅ Combined dataset saved to: {combined_csv_path}")
print("\nPreview of combined data:")
combined_df.head()