# Notebook for transforming posts.csv with preprocessing steps

# Import Libraries

In [1]:
import re
import string
import pandas as pd
from unidecode import unidecode
from nltk import word_tokenize
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

## Trimming Our Dataset

Actions Performed:
- drop null values
- drop duplicate posts
- drop posts whose selftext or title is a `[deleted]` or `[removed]` marker; both fields are checked because together they make up the text of each post
- combine both text fields into one new text field named all_text

In [2]:
# Load the raw posts and trim them in one pipeline:
#   1. drop rows with any null value
#   2. drop posts whose body, then whose title, repeats an earlier post
#   3. drop posts whose body or title is a deleted/removed marker
#   4. join title and body into a single `all_text` column
posts = (
    pd.read_csv('../Data/posts.csv')
    .dropna()
    .drop_duplicates('selftext', ignore_index=True)
    .drop_duplicates('title', ignore_index=True)
    .loc[lambda frame: frame['selftext'].ne('[deleted]') & frame['selftext'].ne('[removed]')]
    .loc[lambda frame: frame['title'].ne('[deleted]') & frame['title'].ne('[removed]')]
    .assign(all_text=lambda frame: frame['title'] + ' ' + frame['selftext'])
)
posts

Unnamed: 0,selftext,title,subreddit,author,created_utc,id,all_text
0,I was surprised to see (no pun intended) that ...,Anyone here?,Keratoconus,nomofica,1329249287,ppja5,Anyone here? I was surprised to see (no pun in...
1,I was just diagnosed with KC a few days ago. I...,Strabismus and Keratoconus?,Keratoconus,zwizh,1371182579,1gbg5v,Strabismus and Keratoconus? I was just diagnos...
2,"Hi guys,\n\nI posted this yesterday in /r/self...",19 months post corneal transplant and I can see!,Keratoconus,TheAbyssGazesAlso,1370899466,1g2mdm,19 months post corneal transplant and I can se...
3,"Hi, my cousin was diagnosed with Keratoconus, ...",Informations on how to treat Keratoconus and s...,Keratoconus,Sbadiglio,1369853969,1faa1w,Informations on how to treat Keratoconus and s...
4,Got diagnosed a few days ago. My left eye is a...,Any help for a newly diagnosed?,Keratoconus,Daamp,1367279940,1ddiew,Any help for a newly diagnosed? Got diagnosed ...
...,...,...,...,...,...,...,...
93935,This past Thursday I was on a flight and passe...,Recent SSD/SSHL,MonoHearing,Bangstick61,1555530344,bec55x,Recent SSD/SSHL This past Thursday I was on a ...
93936,I was diagnosed with SSHL on my left side back...,SSHL question,MonoHearing,TrickyDK,1555519826,bea3ij,SSHL question I was diagnosed with SSHL on my ...
93937,Consideration for the useful gadgets list - I...,Mono Switchable Headphones with individual vol...,MonoHearing,Biblos_Geek,1554907552,bbmv0u,Mono Switchable Headphones with individual vol...
93938,How to See Sound in Fortnite Quickly - Visuali...,How to See Sound in Fortnite Quickly - Visuali...,MonoHearing,Biblos_Geek,1554738446,bavdxz,How to See Sound in Fortnite Quickly - Visuali...


## Make Transformations Directly on Text of Posts

- apply the `unidecode()` function to every post. This is done mainly to convert curly apostrophes to straight ones, which makes replacing contractions simpler
- lowercase all the text

In [3]:
# Transliterate every post with unidecode first, then lowercase the result.
transliterated_text = posts['all_text'].map(unidecode)
posts['all_text'] = transliterated_text.map(str.lower)

- create a dictionary mapping each contraction to its expanded phrase
- replace every occurrence of the contractions defined in the dictionary across all our posts

In [4]:
# Lowercase contraction -> expanded phrase. Keys are matched as plain substrings
# of the (already lowercased, straight-apostrophe) post text.
contraction_dict = {
    "ain't": "is not", "aren't": "are not", "can't": "can not", "'cause": "because",
    "could've": "could have", "couldn't": "could not", "didn't": "did not",
    "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not",
    "haven't": "have not", "he'd": "he would", "he'll": "he will", "he's": "he is",
    "how'd": "how did", "how'll": "how will", "how's": "how is",
    "i'd": "i would", "i'll": "i will", "i'm": "i am", "i've": "i have",
    "isn't": "is not", "it'd": "it would", "it'll": "it will", "it's": "it is",
    "let's": "let us", "ma'am": "madam", "might've": "might have", "mightn't": "might not",
    "must've": "must have", "mustn't": "must not", "needn't": "need not",
    "shan't": "shall not", "she'd": "she would", "she'll": "she will", "she's": "she is",
    "should've": "should have", "shouldn't": "should not", "so've": "so have",
    "that'd": "that would", "that's": "that is", "there's": "there is", "here's": "here is",
    "they'd": "they would", "they'll": "they will", "they're": "they are",
    "they've": "they have", "wasn't": "was not", "we'd": "we would", "we'll": "we will",
    "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will",
    "what're": "what are", "what's": "what is", "what've": "what have", "when's": "when is",
    "when've": "when have", "where'd": "where did", "where's": "where is",
    "where've": "where have", "who'll": "who will", "who's": "who is", "who've": "who have",
    "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not",
    "won't've": "will not have", "would've": "would have", "wouldn't": "would not",
    "y'all": "you all", "you'd": "you would", "you'll": "you will", "you're": "you are",
    "you've": "you have",
}

# One alternation over every contraction, longest key first, so that a compound
# such as "won't've" is matched as a whole instead of being pre-empted by its
# prefix "won't" (which would leave a dangling "'ve" behind).
contraction_pattern = re.compile(
    "|".join(re.escape(contraction) for contraction in sorted(contraction_dict, key=len, reverse=True))
)


def expand_contractions(text):
    """Return `text` with every contraction in `contraction_dict` expanded.

    Matching is by plain substring (no word boundaries) and is done in a single
    left-to-right pass over the text.
    """
    return contraction_pattern.sub(lambda match: contraction_dict[match.group(0)], text)


# A single pass per post replaces the previous one-pass-per-dictionary-entry loop.
posts['all_text'] = posts['all_text'].map(expand_contractions)

- quick function to remove things like IP addresses, http/https links, email addresses, etc.
- also remove punctuation (hyphens are first converted to underscores, and underscores are kept)

In [5]:
# Every punctuation character except the underscore; built once because the
# function below is applied to every post.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation.replace('_', ''))


def removeDomainSpecificClutter(comment):
    """Strip non-linguistic clutter from a single post and return the cleaned text.

    Steps, in order:
      1. replace IPv4-looking dotted quads with a space
      2. replace the HTML-escaped greater-than entity ``&gt;`` with a space
      3. replace http/https links (up to and including the next whitespace
         character, if any) with a space
      4. replace email addresses with a space, wherever they occur in the text
      5. turn hyphens into underscores and forward slashes into spaces
      6. delete all remaining punctuation except underscores

    Parameters
    ----------
    comment : str
        Text of one post.

    Returns
    -------
    str
        The cleaned text. Whitespace is not collapsed, so removed items can
        leave runs of spaces behind.
    """
    # remove IP addresses
    comment = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", ' ', comment)
    # remove greater than symbols
    comment = re.sub(r"&gt;", ' ', comment)
    # remove http and https links; the optional trailing \s consumes the
    # whitespace that ends the link
    comment = re.sub(r"https?://\S*\s?", ' ', comment)
    # remove email addresses anywhere in the text (the pattern is deliberately
    # unanchored: with ^...$ it would only fire when the whole post is an address)
    comment = re.sub(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9.-]+", ' ', comment)
    # replace hyphens with underscores so hyphenated words stay a single token
    comment = comment.replace("-", "_")
    # replace forward slashes with a space
    comment = comment.replace("/", ' ')
    # remove all punctuation besides underscores
    return comment.translate(_PUNCTUATION_TABLE)

In [6]:
# Run the clutter-removal function over the text of every post.
posts['all_text'] = posts['all_text'].map(removeDomainSpecificClutter)

## Preview & Save

- preview everything we want to save

In [7]:
# Columns to preview, listed in the order they are displayed.
preview_columns = ['subreddit', 'all_text', 'id', 'created_utc', 'selftext', 'title']
posts.loc[:, preview_columns]

Unnamed: 0,subreddit,all_text,id,created_utc,selftext,title
0,Keratoconus,anyone here i was surprised to see no pun inte...,ppja5,1329249287,I was surprised to see (no pun intended) that ...,Anyone here?
1,Keratoconus,strabismus and keratoconus i was just diagnose...,1gbg5v,1371182579,I was just diagnosed with KC a few days ago. I...,Strabismus and Keratoconus?
2,Keratoconus,19 months post corneal transplant and i can se...,1g2mdm,1370899466,"Hi guys,\n\nI posted this yesterday in /r/self...",19 months post corneal transplant and I can see!
3,Keratoconus,informations on how to treat keratoconus and s...,1faa1w,1369853969,"Hi, my cousin was diagnosed with Keratoconus, ...",Informations on how to treat Keratoconus and s...
4,Keratoconus,any help for a newly diagnosed got diagnosed a...,1ddiew,1367279940,Got diagnosed a few days ago. My left eye is a...,Any help for a newly diagnosed?
...,...,...,...,...,...,...
93935,MonoHearing,recent ssd sshl this past thursday i was on a ...,bec55x,1555530344,This past Thursday I was on a flight and passe...,Recent SSD/SSHL
93936,MonoHearing,sshl question i was diagnosed with sshl on my ...,bea3ij,1555519826,I was diagnosed with SSHL on my left side back...,SSHL question
93937,MonoHearing,mono switchable headphones with individual vol...,bbmv0u,1554907552,Consideration for the useful gadgets list - I...,Mono Switchable Headphones with individual vol...
93938,MonoHearing,how to see sound in fortnite quickly _ visuali...,bavdxz,1554738446,How to See Sound in Fortnite Quickly - Visuali...,How to See Sound in Fortnite Quickly - Visuali...


- save this preprocessed data (the export line below is commented out; uncomment it to write the CSV)

In [8]:
# The export below is commented out, so running this cell writes nothing.
# Uncomment it to write the selected columns, without the index, to
# ../Data/preprocessed_posts.csv.
# posts[['subreddit', 'all_text', 'id', 'created_utc', 'title', 'selftext']].to_csv('../Data/preprocessed_posts.csv', index=False)