In [1]:
import pandas as pd
import numpy as np
import os
import glob
from pathlib import Path
import json
import re
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud,STOPWORDS
import nltk
from datetime import datetime, date
import sys

# Show the directory the notebook was started from, then switch the working
# directory to the data folder so later relative paths resolve against it.
starting_dir = Path.cwd()
print(starting_dir)
os.chdir('../data')

/Users/veochae/Desktop/Dreams/data cleaning


In [2]:
# Load the raw export; the first CSV column is used as the row index.
raw_csv_path = "./raw data/raw_data.csv"
df = pd.read_csv(raw_csv_path, index_col=0)

In [3]:
# Drop incomplete rows FIRST: datetime.fromtimestamp() raises ValueError on NaN,
# so converting before dropna() would crash on any row with a missing date.
# .copy() gives an independent frame so the column assignment below is unambiguous.
df = df.dropna().copy()

# Change the 'date' column from a numeric timestamp to a datetime for ease of
# understanding. fromtimestamp() converts using the machine's local timezone.
# NOTE(review): presumably 'date' holds Unix epoch seconds -- confirm against the scraper.
df['date'] = [datetime.fromtimestamp(ts) for ts in df['date']]

In [4]:
# Character length of each dream text.
df['length'] = df['text'].map(len)

# The 5th-percentile length is computed once (the cutoff does not change per row).
# Rows strictly above the cutoff get t_f == True; rows at or below it get False.
length_cutoff = np.percentile(df['length'], 5)
df['t_f'] = df['length'] > length_cutoff

# Only keep the t_f == True rows, as an independent copy of the filtered frame.
semi = df.loc[df['t_f']].copy()

# Export the semi-raw dataset.
semi.to_csv("./raw data/semi_raw.csv")

In [5]:
from nltk.stem.snowball import SnowballStemmer

# Corpora used below. 'wordnet' is required by WordNetLemmatizer; without it the
# lemmatization step fails on a machine where the corpus is not already cached.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# English stopword list consulted by remove_stopwords().
stopword = nltk.corpus.stopwords.words('english')

# Stemmers are instantiated here but not used by the functions defined in this cell;
# kept so any other cell that refers to them keeps working.
ps = nltk.PorterStemmer()
s_stemmer = SnowballStemmer(language='english')

# Lemmatizer used by lemmatizer().
wn = nltk.WordNetLemmatizer()


# Ordered (pattern, replacement) rules applied by clean(). Order matters: for
# example newlines are turned into spaces before the space-delimited slang
# rules run, and apostrophe contractions are expanded before "n't".
_CLEAN_RULES = (
    (r'https?://\S+|www\.\S+', ''),      # website urls
    (r'@\S+', ''),                       # anything that follows @
    (r'#\S+', ''),                       # anything that follows #
    (r'[0-9]', ''),                      # digits
    (r'\n', ' '),                        # new line -> space so adjacent words are not glued together
    ("'m", ' am '),
    ("'re", ' are '),
    ("'d", ' had '),
    ("'s", ' is '),
    ("'ve", ' have '),
    (' im ', ' i am '),
    (' iam ', ' i am '),
    (' youre ', ' you are '),
    (' theyre ', ' they are '),
    (' theyve ', ' they have '),
    (' weve ', ' we have '),
    (' isnt ', ' is not '),
    (' arent ', ' are not '),
    (' ur ', ' you are '),
    (' ive ', ' i have '),
    ('_', ''),
    ('"', ''),
    (' bc ', ' because '),
    (' aka ', ' also known as '),
    ('√©', 'e'),                         # mis-decoded "é"
    (' bf ', ' boyfriend '),
    (' gf ', ' girlfriend '),
    (' btw ', ' by the way '),
    (' btwn ', ' between '),
    (r'([a-z])\1{2,}', r'\1'),           # 3+ repeats of a letter collapse to one (ahhhh -> ah)
    (' ctrl ', ' control '),
    (' cuz ', ' because '),
    (' dif ', ' different '),
    (' dm ', ' direct message '),
    ("n't", ' not '),
    (' fav ', ' favorite '),
    (' fave ', ' favorite '),
    (' fml ', ' fuck my life '),
    (' hq ', ' headquarter '),
    (' hr ', ' hours '),
    (' idk ', ' i do not know '),
    (' ik ', ' i know '),
    (' lol ', ' laugh out loud '),
    (' u ', ' you '),
    ('√¶', 'ae'),                        # mis-decoded "æ"
    ('√®', 'e'),                         # mis-decoded "è"
)


def clean(text):
    """Normalise one piece of text before tokenization.

    Applies every rule in ``_CLEAN_RULES`` in order with ``re.sub``: strips
    urls, @-mentions, #-tags, digits, underscores and double quotes; expands
    apostrophe contractions and a fixed list of space-delimited slang
    abbreviations; collapses letters repeated three or more times; and repairs
    a few mis-decoded accented characters. Leading and trailing whitespace is
    stripped from the result.

    The letter and slang patterns are lower case only; the caller in this
    notebook lower-cases the text before calling.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    for pattern, replacement in _CLEAN_RULES:
        text = re.sub(pattern, replacement, text)
    return text.strip()

def tokenization(text):
    """Split text into word tokens.

    A token is a maximal run of word characters (regex ``\\w+``), so any
    punctuation or whitespace acts as a separator. Unlike a plain
    ``re.split`` on ``\\W+``, no empty-string tokens are produced when the
    text starts or ends with a separator, and an empty string yields ``[]``.

    Parameters
    ----------
    text : str
        Text to tokenize.

    Returns
    -------
    list of str
        The tokens in order of appearance.
    """
    return re.findall(r'\w+', text)

def remove_stopwords(text):
    """Drop every token that appears in the module-level ``stopword`` list.

    Parameters
    ----------
    text : list of str
        Tokens to filter.

    Returns
    -------
    list of str
        The tokens that are not stopwords, in their original order.
    """
    # Hash-based membership test instead of scanning the stopword list for
    # every token; the filtered result is the same.
    stopword_lookup = frozenset(stopword)
    return [word for word in text if word not in stopword_lookup]

def lemmatizer(text):
    """Lemmatize every token with the module-level WordNet lemmatizer ``wn``.

    Lemmatizing is used here rather than stemming because it keeps the
    context of the words alive.

    Parameters
    ----------
    text : list of str
        Tokens to lemmatize.

    Returns
    -------
    list of str
        One lemma per input token, in the same order.
    """
    return list(map(wn.lemmatize, text))

def vectorization(li):
    """Build the document-term count matrix for a collection of texts.

    Fits a default ``CountVectorizer`` on ``li`` and returns the counts as a
    dense array with one row per text and one column per vocabulary word.

    Parameters
    ----------
    li : iterable of str
        The texts to vectorize.

    Returns
    -------
    numpy.ndarray
        Dense 2-D array of word counts.
    """
    count_matrix = CountVectorizer().fit_transform(li)
    return count_matrix.toarray()

def get_column_name(li):
    """Return the vocabulary a default ``CountVectorizer`` learns from ``li``.

    The words come back in the vectorizer's feature order, so they can be used
    as the column names of the matrix produced by fitting on the same texts.

    Parameters
    ----------
    li : iterable of str
        The texts to learn the vocabulary from.

    Returns
    -------
    list
        The feature names, in feature order.
    """
    fitted = CountVectorizer().fit(li)
    return list(fitted.get_feature_names_out())

def extract_array(df):
    """Run the full text-cleaning pipeline on the ``'text'`` column of ``df``.

    Steps: lower-case and ``clean`` each text, tokenize, remove stopwords,
    lemmatize, rejoin into strings, and build a word-count matrix. A progress
    line is printed after each stage.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'text'`` column of strings.

    Returns
    -------
    tuple
        ``(clean_text, tokenized, x_stopwords, lemmatized, complete, mapx)``:

        - ``clean_text``: Series of cleaned text, rebuilt by joining the tokens
          with single spaces.
        - ``tokenized``: Series of token lists.
        - ``x_stopwords``: Series of token lists with stopwords removed.
        - ``lemmatized``: Series of lemmatized token lists.
        - ``complete``: Series of the lemmatized tokens joined with spaces.
        - ``mapx``: DataFrame of word counts, one row per text and one column
          per vocabulary word. It has a fresh 0..n-1 index, not ``df``'s index.
    """
    cleaned = df['text'].apply(lambda raw: clean(raw.lower()))
    tokenized = cleaned.apply(tokenization)
    # Rejoin the tokens so the cleaned text has exactly one space between words.
    clean_text = tokenized.apply(" ".join)
    print("Complete: text cleaning")
    print("Complete: tokenization")

    x_stopwords = tokenized.apply(remove_stopwords)
    print("Complete: stopwords removed")

    lemmatized = x_stopwords.apply(lemmatizer)
    print("Complete: lemmatization")

    # Rejoin the lemmas so each text looks like a sentence again.
    complete = lemmatized.apply(" ".join)

    # Fit the vectorizer once and take both the counts and the vocabulary from
    # the same fitted object, instead of fitting the same corpus twice.
    vectorizer = CountVectorizer()
    counts = vectorizer.fit_transform(complete)
    mapx = pd.DataFrame(counts.toarray(),
                        columns=list(vectorizer.get_feature_names_out()))
    print("Complete: vectorization")
    print("All Done!")

    return clean_text, tokenized, x_stopwords, lemmatized, complete, mapx

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/veochae/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/veochae/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [6]:
# Run the whole cleaning pipeline on the filtered dreams and keep every
# intermediate stage; the word-count matrix is stored as `corpus`.
(
    clean_text,
    tokenized,
    x_stopwords,
    lemmatized,
    complete,
    corpus,
) = extract_array(semi)

Complete: text cleaning
Complete: tokenization
Complete: stopwords removed
Complete: lemmatization
Complete: vectorization
All Done!


In [7]:
# Lower-case the titles as well, in case we want to compare them with the
# (lower-cased) dream content later. Uses the vectorised string accessor
# rather than a Python-level loop.
semi['title'] = semi['title'].str.lower()

In [8]:
# Names of the cleaned text versions; each one is exported to its own csv.
titles = ['clean_text', 'tokenized', 'x_stopwords', 'lemmatized', 'complete']

# Explicit name -> Series lookup, instead of pulling variables out of vars() by name.
cleaned_versions = {
    'clean_text': clean_text,
    'tokenized': tokenized,
    'x_stopwords': x_stopwords,
    'lemmatized': lemmatized,
    'complete': complete,
}

# Pair the title with each cleaned version of the text and export to csv.
# reset_index(drop=True) discards the original row labels in one step and
# works whether or not the index has a name.
for context in titles:
    export_df = pd.DataFrame({'title': semi['title'],
                              context: cleaned_versions[context]}).reset_index(drop=True)
    export_df.to_csv(f'./cleaned data/{context}.csv')

# corpus is a dataframe of its own since it is a whole new matrix, so it is
# exported to csv separately.
corpus.to_csv("./cleaned data/corpus.csv")