In [1]:
import pandas as pd
import numpy as np
import os
import glob
from pathlib import Path
import json
import re
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud,STOPWORDS
import nltk
from datetime import datetime, date
import sys

# Check the current working path and switch the working directory to the data folder.
#
# The notebook's own directory is remembered the first time this cell runs, so that
# re-running the cell does not resolve '../data' relative to an already-changed cwd.
if 'NOTEBOOK_DIR' not in globals():
    NOTEBOOK_DIR = Path.cwd()

print(NOTEBOOK_DIR)
os.chdir(NOTEBOOK_DIR / '..' / 'data')



/Users/veochae/Desktop/Dreams/data cleaning


In [2]:
# Load the raw posts from the current working directory; the CSV's first column
# becomes the row index.
df = pd.read_csv(
    "./raw_data.csv",
    index_col=0,
)

In [3]:
# Drop incomplete rows first so that a missing timestamp can never reach
# datetime.fromtimestamp(), which raises on NaN.
df = df.dropna()

# Only convert while the column still holds numeric POSIX timestamps; this keeps the
# cell safe to re-run after `date` has already become a datetime column.
if not pd.api.types.is_datetime64_any_dtype(df['date']):
    # NOTE: datetime.fromtimestamp() yields naive datetimes in the machine's local
    # timezone, so the resulting values depend on where the notebook is executed.
    df = df.assign(date=[datetime.fromtimestamp(ts) for ts in df['date']])
df

Unnamed: 0,subreddit,title,text,date
0,Dreams,I think it was my grandad,"So, I had this dream when I was a kid , age 9...",2023-02-05 21:13:41
1,Dreams,A nondescript blurry figure standing over me,Im laying in bed face up where a blurry unknow...,2023-02-05 20:58:10
2,Dreams,Dream my family hated me because I am disabled,I had this dream 3 nights ago and it still has...,2023-02-05 20:29:39
3,Dreams,Furry exhibitionists on a sidewalk,Oh my god it was awful.\n\nSo awful. \n\nI was...,2023-02-05 20:29:22
5,Dreams,I had deja vu 3 times last week,For context: I don’t dream often but when I do...,2023-02-05 20:02:04
...,...,...,...,...
982,Dreams,Several months ago I had reoccurring dreams ab...,&amp;#x200B;\n\nhttps://preview.redd.it/8zi4ej...,2023-01-27 06:43:28
984,Dreams,Fudge Shop Dream,Today I dreamed about being at a large fudge s...,2023-01-27 06:18:24
985,Dreams,I killed a man that travelled through time,I was running around in a mansion trying to se...,2023-01-27 05:53:28
986,Dreams,Recurring dream,"As the title says. It's a recurring dream, but...",2023-01-27 05:47:53


In [4]:
# Preview the post bodies that the cleaning pipeline below will consume.
df.loc[:, 'text']

0      So, I  had this dream when I was a kid , age 9...
1      Im laying in bed face up where a blurry unknow...
2      I had this dream 3 nights ago and it still has...
3      Oh my god it was awful.\n\nSo awful. \n\nI was...
5      For context: I don’t dream often but when I do...
                             ...                        
982    &amp;#x200B;\n\nhttps://preview.redd.it/8zi4ej...
984    Today I dreamed about being at a large fudge s...
985    I was running around in a mansion trying to se...
986    As the title says. It's a recurring dream, but...
987    I felt a bit of shame as I agreed in the dream...
Name: text, Length: 900, dtype: object

In [7]:
from nltk.stem.snowball import SnowballStemmer

# Corpora needed by the objects created below:
#   - 'stopwords' backs nltk.corpus.stopwords
#   - 'wordnet' (plus 'omw-1.4') backs WordNetLemmatizer; without 'wordnet' the first
#     call to wn.lemmatize() raises LookupError on a machine that has never fetched it.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Shared, module-level resources used by the helper functions defined below.
stopword = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()
s_stemmer = SnowballStemmer(language='english')
wn = nltk.WordNetLemmatizer()


def clean(text):
    """Strip URLs, markup and social-media artefacts from one piece of text.

    The substitutions run in a fixed order because later steps rely on earlier ones
    (for example the ``&amp;#;`` residue only exists after digit-bearing tokens have
    been removed from ``&amp;#x200B;``).

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text with runs of whitespace collapsed to single spaces and
        leading/trailing whitespace removed.
    """
    # Remove URLs before anything else so their punctuation does not confuse later steps.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # Collapse every whitespace run (spaces, tabs, newlines) to one space. Doing this
    # early also lets the bracket/tag patterns below match across original line breaks.
    text = re.sub(r'\s+', ' ', text)
    # Drop [bracketed] spans.
    text = re.sub(r'\[.*?\]', '', text)
    # Drop any token that contains a digit.
    text = re.sub(r'\w*\d\w*', '', text)
    # Drop <tag>-like spans.
    text = re.sub(r'<.*?>+', '', text)
    # Drop the stand-alone marker "RT". Word boundaries keep ordinary upper-case words
    # that merely contain the letters (e.g. "HEART") intact.
    text = re.sub(r'\bRT\b', '', text)
    # Drop a single word character followed by '-' directly after a colon.
    text = re.sub(r'(?<=:)\w-', '', text)
    # Drop the handle following an '@', then the '@' and ':' characters themselves.
    text = re.sub(r'(?<=@)\w+', '', text)
    text = text.replace('@', '').replace(':', '')
    text = text.replace('_', '')
    # What is left of an "&amp;#x200B;" entity once its digit-bearing token is gone.
    text = text.replace('&amp;#;', '')
    # The removals above can leave doubled spaces behind; collapse them again.
    text = re.sub(r' +', ' ', text)
    return text.strip()

def tokenization(text):
    """Split text into word tokens.

    A token is a maximal run of word characters (``\\w``). Unlike splitting on
    ``\\W+``, this never produces empty-string tokens when the text is empty or
    starts/ends with punctuation or whitespace.

    Parameters
    ----------
    text : str
        Text to tokenize.

    Returns
    -------
    list of str
        The tokens in order of appearance; an empty list when there are none.
    """
    return re.findall(r'\w+', text)

def remove_stopwords(text):
    """Return the tokens of ``text`` that do not appear in the module-level ``stopword`` list.

    ``text`` is an iterable of tokens; order and duplicates of the kept tokens are preserved.
    """
    kept = []
    for token in text:
        if token in stopword:
            continue
        kept.append(token)
    return kept

def stemming1(text):
    """Apply the module-level Porter stemmer ``ps`` to every token and return the stems as a list."""
    stems = []
    for token in text:
        stems.append(ps.stem(token))
    return stems

def stemming2(text):
    """Apply the module-level Snowball stemmer ``s_stemmer`` to every token and return the stems as a list."""
    stems = []
    for token in text:
        stems.append(s_stemmer.stem(token))
    return stems

def lemmatizer(text):
    """Lemmatize every token with the module-level WordNet lemmatizer ``wn`` and return a list."""
    lemmas = []
    for token in text:
        lemmas.append(wn.lemmatize(token))
    return lemmas

def vectorization(li):
    """Fit a fresh ``CountVectorizer`` on ``li`` and return the counts as a dense array.

    ``li`` is an iterable of documents (strings). The result is a 2-D NumPy array
    with one row per document and one column per vocabulary term.
    """
    counts = CountVectorizer().fit_transform(li)
    # Densify the sparse count matrix into a plain ndarray.
    return counts.toarray()

def get_column_name(li):
    """Return, as a list, the feature names a fresh ``CountVectorizer`` learns from ``li``."""
    fitted = CountVectorizer()
    # Only the learned vocabulary is needed here; the transformed matrix is discarded.
    fitted.fit_transform(li)
    return list(fitted.get_feature_names_out())

def extract_array(df):
    """Run the text-preprocessing pipeline on ``df['text']`` and return every stage.

    Stages, in order: ``clean`` -> lower-case + ``tokenization`` ->
    ``remove_stopwords`` -> ``stemming1`` -> ``stemming2`` -> ``lemmatizer`` ->
    re-join into strings -> bag-of-words counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``text`` column of strings.

    Returns
    -------
    tuple
        ``(clean_text, tokenized, x_stopwords, stem, lemmatized, complete, mapx)``:

        - ``clean_text``: Series of the lower-cased tokens re-joined with spaces.
        - ``tokenized``: Series of token lists.
        - ``x_stopwords``: Series of token lists after stop-word removal.
        - ``stem``: Series of token lists after both stemming passes.
        - ``lemmatized``: Series of token lists after lemmatization.
        - ``complete``: Series of the lemmatized tokens re-joined with spaces.
        - ``mapx``: DataFrame of term counts, one column per vocabulary term,
          indexed like ``df`` so its rows line up with the Series above.
    """
    cleaned = df['text'].apply(clean)
    print("Complete: text cleaning")

    tokenized = cleaned.apply(lambda x: tokenization(x.lower()))
    # The returned clean_text is the lower-cased token stream re-joined with spaces,
    # not the direct output of clean().
    clean_text = tokenized.apply(lambda x: " ".join(x))
    print("Complete: tokenization")

    x_stopwords = tokenized.apply(remove_stopwords)
    print("Complete: stopwords removed")

    # NOTE(review): the second stemmer runs on the output of the first, and the
    # lemmatizer then runs on already-stemmed tokens -- confirm this stacking is intended.
    stem = x_stopwords.apply(stemming1)
    print("Complete: stemming 1")
    stem = stem.apply(stemming2)
    print("Complete: stemming 2")
    lemmatized = stem.apply(lemmatizer)
    print("Complete: lemmatization")

    complete = lemmatized.apply(lambda x: " ".join(x))

    # Reuse the input's index: without it the count matrix would get a fresh 0..n-1
    # index and its rows would no longer match the other outputs by label whenever
    # df has gaps in its index (e.g. after dropna()).
    mapx = pd.DataFrame(
        vectorization(complete),
        columns=get_column_name(complete),
        index=complete.index,
    )
    print("Complete: vectorization")
    print("All Done!")

    return clean_text, tokenized, x_stopwords, stem, lemmatized, complete, mapx

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/veochae/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/veochae/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [8]:
# Run the preprocessing pipeline once and unpack each stage it returns.
(
    clean_text,
    tokenized,
    x_stopwords,
    stem,
    lemmatized,
    complete,
    corpus,
) = extract_array(df)

Complete: text cleaning
Complete: tokenization
Complete: stopwords removed
Complete: stemming 1
Complete: stemming 2
Complete: lemmatization
Complete: vectorization
All Done!


In [10]:
# Map each output file stem to the object to be written. An explicit mapping avoids
# resolving variables by name through vars(), which breaks silently on a rename.
outputs = {
    'clean_text': clean_text,
    'tokenized': tokenized,
    'x_stopwords': x_stopwords,
    'stem': stem,
    'lemmatized': lemmatized,
    'complete': complete,
    'corpus': corpus,
}
titles = list(outputs)

# to_csv() does not create missing directories, so make sure the target folder exists.
out_dir = Path('./cleaned data')
out_dir.mkdir(parents=True, exist_ok=True)

for title in titles:
    outputs[title].to_csv(out_dir / f'{title}.csv')