# Pre-Processing Depression Dataset
tutorial video: https://www.youtube.com/watch?v=HVBk2Ge_Q98
contractions: https://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
removing punctuations: https://www.pluralsight.com/guides/importance-of-text-pre-processing
overall: https://www.analyticsvidhya.com/blog/2020/04/beginners-guide-exploratory-data-analysis-text-data/


### Import Libraries 

In [None]:
# import libraries 

import pandas as pd
import numpy as np
import string
import re
from textblob import TextBlob
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
set(stopwords.words('english'))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

### Mount Google Drive 

In [None]:
#Start by connecting gdrive into the google colab
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
#!ls /content/gdrive/MyDrive/'Depression Project'/

### Read Data 

In [None]:
# making data frame from csv file 
# change file path accordingly 
data = pd.read_csv("/content/gdrive/MyDrive/Depression Project/RomeroDataset.csv") 
data = data.astype(str)
data.shape

(10314, 3)

### Drop Duplicates 

In [None]:
# sorting by aplhabetical text order  
data.sort_values("Text", inplace = True) 
  
# dropping duplicte values except the first occurence
# since keep is first in default, don't need add parameter keep 
data.drop_duplicates(subset ="Text", inplace = True) 

In [None]:
# new shape after dropping duplicates 
data.shape

(10282, 3)

In [None]:
def max_len(x):
    a=x.split()
    return len(a)

max(data['Text'].apply(max_len))

92

In [None]:
# see the labels in the DataFrame
data.columns

Index(['Unnamed: 0', 'Text', 'Target'], dtype='object')

### Drop null Values

In [None]:
# number of null value 
data.isnull().sum()
data.dropna()

Unnamed: 0.1,Unnamed: 0,Text,Target
1128,113845.0,Logging it out. Thank you Kiana...you're so...,1.0
3477,353737.0,@remysoon,1.0
961,97521.0,I'm happy and content,1.0
4459,448655.0,Love you guys! I'm going to work in a few. Kn...,1.0
1020,103728.0,My mom likes Milow's version of Ayo Technolog...,1.0
...,...,...,...
3076,312614.0,{YAWN} Good Morning Twitters,1.0
1450,148030.0,~ POURED @ the Nats game but I was prepared: u...,1.0
7450,743321.0,Â©â€¢Â©â€¢Â© Brand your business by putting y...,1.0
4752,477804.0,æŽ¨ (03:36:54 HKT) ä½ ä¸?æ‡‚æ?¨æ?¨åœ°è·Ÿä»–ã€€...,1.0


### Check Spelling mistakes (typos)

In [None]:
#spellcheck
# can take a long time 
from textblob import TextBlob
data["Text"].apply(lambda x: str(TextBlob(x).correct()))

### Lower case Text

In [None]:
# lowercase all the letters 
data["Text"] = data["Text"].str.lower()
data.head()

Unnamed: 0.1,Unnamed: 0,Text,Target
1128,113845.0,logging it out. thank you kiana...you're so...,1.0
3477,353737.0,@remysoon,1.0
961,97521.0,i'm happy and content,1.0
4459,448655.0,love you guys! i'm going to work in a few. kn...,1.0
1020,103728.0,my mom likes milow's version of ayo technolog...,1.0


### Expand Contractions 

In [None]:
# create a key:value dictionary 
# expand contractions 
# rephrase bad words 
# remove twitter shortcut rt (since it is very common)
# only leave the most relevant values to simplify tweets 
contractionsAndBadWords = { 
    "ain't": "am not / are not / is not / has not / have not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he had / he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is / how does",
    "i'd": "I had / I would",
    "i'd've": "I would have",
    "i'll": "I shall / I will",
    "i'll've": "I shall have / I will have",
    "i'm": "I am",
    "im": "I am",
    "i've": "I have",
    "isn't": "is not",
    "it'd": "it had / it would",
    "it'd've": "it would have",
    "it'll": "it shall / it will",
    "it'll've": "it shall have / it will have",
    "it's": "it has / it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she had / she would",
    "she'd've": "she would have",
    "she'll": "she shall / she will",
    "she'll've": "she shall have / she will have",
    "she's": "she has / she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as / so is",
    "that'd": "that would / that had",
    "that'd've": "that would have",
    "that's": "that has / that is",
    "there'd": "there had / there would",
    "there'd've": "there would have",
    "there's": "there has / there is",
    "they'd": "they had / they would",
    "they'd've": "they would have",
    "they'll": "they shall / they will",
    "they'll've": "they shall have / they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we had / we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what shall / what will",
    "what'll've": "what shall have / what will have",
    "what're": "what are",
    "what's": "what has / what is",
    "what've": "what have",
    "when's": "when has / when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where has / where is",
    "where've": "where have",
    "who'll": "who shall / who will",
    "who'll've": "who shall have / who will have",
    "who's": "who has / who is",
    "who've": "who have",
    "why's": "why has / why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you had / you would",
    "you'd've": "you would have",
    "you'll": "you shall / you will",
    "you'll've": "you shall have / you will have",
    "you're": "you are",
    "you've": "you have",
    "fucking": "f_word",
    "fuck" : "f_word", 
    "shit" : "s_word", 
    "rt": "",
    "tryna" : "try to", 
    "lol" : "laughing out loud", 
    "idc" : "i do not care", 
    "lil" : "little", 
    "omg" : "oh my god", 
    "tbh" : "to be honest", 
    }

In [None]:
# create a method to replace the keys with their values
def cont_to_exp(x):
    if type(x) is str:
        # remove '\'
        x=x.replace('\\', '')
        for key in contractionsAndBadWords:
            value=contractionsAndBadWords[key]
            # replace contracted words with expanded
            x= x.replace(key,value)
        return x
    else:
        return x

In [None]:
# expand contracted words in text 
data['Text']=data['Text'].apply(lambda x:cont_to_exp(x))
# print the first 50 tweets 
for index,text in enumerate(data["Text"][129:186]):
  print('Tweet %d:\n'%(index+1),text)

Tweet 1:
 &quot;and every story i ever told is pa of you&quot; - yellowcard: how i go 
Tweet 2:
 &quot;bohica2k book of the month-june&quot; is a fresh post at http://bohica2k.com. check out what i have my nose buried in.  
Tweet 3:
 &quot;but you are just so cool, run your hand through your hair, absent mindedly making me want you.&quot; - taylor swift  
Tweet 4:
 &quot;daddy&quot; bought me an air conditioner! 
Tweet 5:
 &quot;expense claI ams rules in full: 1)all claI ams made by mps are within the rules.2)all rules are made by mps.3)er...4)that has / that is it&quot;-private eye 
Tweet 6:
 &quot;frooooooot!!!  nom nom nom nummy!!&quot;    my little girl sure does love her fruit 
Tweet 7:
 &quot;if u seek amy&quot; finally makes sense to me...I am so slow 
Tweet 8:
 &quot;it only takes one person to retweet it&quot; http://bit.ly/pdwh7 
Tweet 9:
 &quot;only those who have learned the power of sincere &amp; selfless contribution experience life's deepest joy: true fulfillment.&quot; 

### Remove Punctuations (after expanding contractions)

In [None]:
# print the punctuations form the string library 
punctuations = string.punctuation
print('list of punctuations:', punctuations)
# create a method that will remove the punctuations 
def punctuation_cleaning(x):
    return x.translate(str.maketrans('', '', punctuations))

list of punctuations: !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [None]:
data['Text']=data['Text'].apply(lambda x:punctuation_cleaning(x))
# expand contracted words in depressed text 
data['Text']=data['Text'].apply(lambda x:cont_to_exp(x))
# print the first 50 tweets 
for index,text in enumerate(data["Text"][1:50]):
  print('Tweet %d:\n'%(index+1),text)

Tweet 1:
  remysoon
Tweet 2:
  I am happy and content
Tweet 3:
  love you guys I am going to work in a few know why I am so happy cause i get to sleep tonight
Tweet 4:
  my mom likes milows version of ayo technology it has  it is a good thing she does not have a clue what it has  it is about
Tweet 5:
  oh my god greatest feeling ever  this is better than drugs
Tweet 6:
  stray cats  stray cat strut â™« httpblipfm890vq
Tweet 7:
  travel is soed went into town  and found national express had disappeared s thankyou porn for the internet i now have an eticket
Tweet 8:
  ty  now quit readin my email already will ya p  sotm bible quiz httptinyurlcompxbhf4
Tweet 9:
  well we have 2 new colts I shall  I will be posting them on our myspace and website join us on myspace httptinyurlcomcnyn7l
Tweet 10:
  and hugladypn quotis there a waiting line to get into club blip  quot  httpblipfm7d2a5
Tweet 11:
  go to cinema and watch hannah montana
Tweet 12:
  httpwwwzavczzavenâ  if you feel like ending yo

### Remove URLs and Hyperlinks 

In [None]:
# count number of emoji in text
# since it only accepts string, join the text into a list first 
listtext = ''.join(data['Text'].tolist())
emoji = len(re.findall(u'[\U0001f600-\U0001f650]', listtext))
print(emoji)

0


In [None]:
# find number of links 
url = listtext.count("http")
print(url)

1153


In [None]:
# remove url links to remove noise 
data['Text'] = data['Text'].str.replace('http\S+|www.\S+', '', case=False)

for index,text in enumerate(data["Text"][1:50]):
  print('Tweet %d:\n'%(index+1),text)

Tweet 1:
  remysoon
Tweet 2:
  I am happy and content
Tweet 3:
  love you guys I am going to work in a few know why I am so happy cause i get to sleep tonight
Tweet 4:
  my mom likes milows version of ayo technology it has  it is a good thing she does not have a clue what it has  it is about
Tweet 5:
  oh my god greatest feeling ever  this is better than drugs
Tweet 6:
  stray cats  stray cat strut â™« 
Tweet 7:
  travel is soed went into town  and found national express had disappeared s thankyou porn for the internet i now have an eticket
Tweet 8:
  ty  now quit readin my email already will ya p  sotm bible quiz 
Tweet 9:
  well we have 2 new colts I shall  I will be posting them on our myspace and website join us on myspace 
Tweet 10:
  and hugladypn quotis there a waiting line to get into club blip  quot  
Tweet 11:
  go to cinema and watch hannah montana
Tweet 12:
    if you feel like ending your life try zav it has  it is a very good programme to teach you how to write using all 

### Count Emojis but don't remove (may be useful for DL models)

In [None]:
# #------------ didn't remove emojis since as they represent feelings which can be used during deep learning -----------#

# #remove the emojis 
# emoji_pattern = re.compile("["
#       u"\U0001F600-\U0001F64F"  # emoticons
#       u"\U0001F300-\U0001F5FF"  # symbols & pictographs
#        u"\U0001F680-\U0001F6FF"  # transport & map symbols
#        "]+", flags=re.UNICODE)

# listtext = emoji_pattern.sub(r'', listtext)
# print(listtext)

### Remove most common Words (Stopwords)

In [None]:
# set stopwords as variable stop
stop = stopwords.words('english')
print(stop)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [None]:
# remove all the stopwords 
data["Text"] = data["Text"].apply(lambda words: ' '.join(word.lower() for word in words.split() if word not in stop))

### Remove Rare Words 

In [None]:
text = ' '.join(data["Text"])
print(len(text))
text = text.split()

629589


In [None]:
freq_comms = pd.Series(text).value_counts()
rare = freq_comms.tail(3000)
print(rare)
data["Text"] = data["Text"].apply(lambda x: ' '.join([t for t in x.split() if t not in rare]))

realgoogle       1
lipsync          1
nightdef         1
anywaynothing    1
rkprincess       1
                ..
murals           1
alans            1
puppylick        1
gtmy             1
elvis0725        1
Length: 3000, dtype: int64


In [None]:
for index,text in enumerate(data["Text"][1:50]):
  print('Tweet %d:\n'%(index+1),text)

Tweet 1:
 remysoon
Tweet 2:
 i happy content
Tweet 3:
 love guys i going work know i happy cause get sleep tonight
Tweet 4:
 mom likes milows version ayo technology good thing clue
Tweet 5:
 oh god greatest feeling ever better drugs
Tweet 6:
 stray cats stray cat strut â™«
Tweet 7:
 travel soed went town found national express disappeared thankyou porn internet
Tweet 8:
 ty quit email already ya p sotm bible quiz
Tweet 9:
 well 2 new colts i shall i posting myspace website join us myspace
Tweet 10:
 quotis waiting line get club blip quot
Tweet 11:
 go cinema watch hannah montana
Tweet 12:
 feel like ending life try zav good programme teach write using ten fingers better razor fighting depression know wanna anything good way end pain
Tweet 13:
 â¦ response chris hipkins replied students depression counselling 19982018
Tweet 14:
 matter live gender old areâsliding pair running shoes may best first step warding depressionemoji man runningemoji man runningemoji man runningtxg compressop

### Check for Null values Again (after pre-processing)

In [None]:
data.dropna(inplace=True)
data.isnull().sum()

Unnamed: 0    0
Text          0
Target        0
dtype: int64

### Save as a new Processed Document

In [None]:
# save processed csv file as a new file 
data.to_csv('/content/gdrive/MyDrive/Depression Project/RomeroDatasetProcessed.csv',index=False)

In [None]:
processedData = pd.read_csv('/content/gdrive/MyDrive/Depression Project/RomeroDatasetProcessed.csv')

In [None]:
processedData.shape

(10282, 3)

In [None]:
for index,text in enumerate(processedData["Text"][1:50]):
  print('Tweet %d:\n'%(index+1),text)

Tweet 1:
 remysoon
Tweet 2:
 i happy content
Tweet 3:
 love guys i going work know i happy cause get sleep tonight
Tweet 4:
 mom likes milows version ayo technology good thing clue
Tweet 5:
 oh god greatest feeling ever better drugs
Tweet 6:
 stray cats stray cat strut â™«
Tweet 7:
 travel soed went town found national express disappeared thankyou porn internet
Tweet 8:
 ty quit email already ya p sotm bible quiz
Tweet 9:
 well 2 new colts i shall i posting myspace website join us myspace
Tweet 10:
 quotis waiting line get club blip quot
Tweet 11:
 go cinema watch hannah montana
Tweet 12:
 feel like ending life try zav good programme teach write using ten fingers better razor fighting depression know wanna anything good way end pain
Tweet 13:
 â¦ response chris hipkins replied students depression counselling 19982018
Tweet 14:
 matter live gender old areâsliding pair running shoes may best first step warding depressionemoji man runningemoji man runningemoji man runningtxg compressop