In [2]:
import pandas as pd
pd.set_option('display.max_columns', 500)
pd.options.mode.chained_assignment = None  # default='warn'
import numpy as np

import datetime as datetime

#Stopwords
from sklearn.feature_extraction import text
from stop_words import get_stop_words
from nltk.corpus import stopwords

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer

In [3]:
sep_sub = pd.read_csv('./datasets/sep_sub_final_cleaned.csv', low_memory=False, parse_dates=['created_utc'])
sep_sub['covid_onset'] = sep_sub['created_utc'].apply(lambda x: 'yes' if x >= datetime.datetime(2020,4,1) else 'no')

In [4]:
sep_sub = sep_sub[['id','num_comments','score', 'link_flair_text','subreddit','covid_onset','text','cleaned_text']]

In [5]:
sep_sub['subreddit'] = sep_sub['subreddit'].apply(lambda x: 'submission')

In [6]:
sep_sub.rename(columns = {'text':'original_text', 'subreddit':'post'}, inplace = True)
print(sep_sub.shape)
sep_sub.head()

(14845, 8)


Unnamed: 0,id,num_comments,score,link_flair_text,post,covid_onset,original_text,cleaned_text
0,24ck1c,0,1,,submission,no,has anyone ever used this lipstick?,has anyone ever used this lipstick
1,2paha7,1,1,,submission,no,Are there any gift sets worth getting this hol...,Are there any gift sets worth getting this hol...
2,2qs93c,1,1,,submission,no,Sephora Free 2014 MAKE UP FOR EVER Birthday Se...,Sephora Free 2014 MAKE UP FOR EVER Birthday Se...
3,34ws35,0,1,,submission,no,In a Gift Box Reviews - Sephora Gift Cards,In a Gift Box Reviews Sephora Gift Cards
4,3ase55,2,2,,submission,no,"Sephora VIB Rouge, Is it worth it? Do new VIB ...",Sephora VIB Rouge Is it worth it Do new VIB RO...


In [7]:
#prep for mapping 'link_flair_text' and 'num_comments' to sub_cmt df
sep_sub.set_index('id', inplace = True)

In [8]:
sep_cmt = pd.read_csv('./datasets/sep_cmt_final_cleaned.csv', low_memory=False, parse_dates=['created_utc'])
sep_cmt['covid_onset'] = sep_cmt['created_utc'].apply(lambda x: 'yes' if x >= datetime.datetime(2020,4,1) else 'no')

In [9]:
sep_cmt = sep_cmt[['parent_id','score','subreddit','covid_onset','body','cleaned_text']]

In [10]:
sep_cmt['subreddit'] = sep_cmt['subreddit'].apply(lambda x: 'comment')

In [11]:
sep_cmt.rename(columns = {'parent_id':'id', 'subreddit':'post','body':'original_text'}, inplace = True)
print(sep_cmt.shape)
sep_cmt.head()

(193958, 6)


Unnamed: 0,id,score,post,covid_onset,original_text,cleaned_text
0,3ase55,1,comment,no,You have to spend 1000$ in a year to be approv...,You have to spend 1000 in a year to be approve...
1,3ase55,1,comment,no,sorry you also get free In Store makeovers ( u...,sorry you also get free In Store makeovers usu...
2,3gf2ep,1,comment,no,Thank u for letting me know about that,Thank you for letting me know about that
3,3ase55,1,comment,no,It is worth it if you like getting your makeup...,It is worth it if you like getting your makeup...
4,3smzui,1,comment,no,I usually order from Sephora in France but the...,I usually order from Sephora in France but the...


In [12]:
#prep for mapping sep_sub values
sep_cmt['num_comments'] = ""
sep_cmt['link_flair_text'] = ""
sep_cmt.head()

Unnamed: 0,id,score,post,covid_onset,original_text,cleaned_text,num_comments,link_flair_text
0,3ase55,1,comment,no,You have to spend 1000$ in a year to be approv...,You have to spend 1000 in a year to be approve...,,
1,3ase55,1,comment,no,sorry you also get free In Store makeovers ( u...,sorry you also get free In Store makeovers usu...,,
2,3gf2ep,1,comment,no,Thank u for letting me know about that,Thank you for letting me know about that,,
3,3ase55,1,comment,no,It is worth it if you like getting your makeup...,It is worth it if you like getting your makeup...,,
4,3smzui,1,comment,no,I usually order from Sephora in France but the...,I usually order from Sephora in France but the...,,


In [13]:
sep_cmt.set_index('id', inplace = True)

In [14]:
sep_cmt['num_comments'] = sep_cmt.index.map(sep_sub['num_comments'])

In [15]:
sep_cmt['link_flair_text'] = sep_cmt.index.map(sep_sub['link_flair_text'])

In [16]:
sep_cmt.sort_index()

Unnamed: 0_level_0,score,post,covid_onset,original_text,cleaned_text,num_comments,link_flair_text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3ase55,1,comment,no,You have to spend 1000$ in a year to be approv...,You have to spend 1000 in a year to be approve...,2.0,
3ase55,1,comment,no,sorry you also get free In Store makeovers ( u...,sorry you also get free In Store makeovers usu...,2.0,
3ase55,1,comment,no,It is worth it if you like getting your makeup...,It is worth it if you like getting your makeup...,2.0,
3gf2ep,1,comment,no,Thank u for letting me know about that,Thank you for letting me know about that,0.0,
3smzui,1,comment,no,I usually order from Sephora in France but the...,I usually order from Sephora in France but the...,1.0,
...,...,...,...,...,...,...,...
t3f0rk,1,comment,yes,The usb charger is but the $75 purchase has be...,The usb charger is but the 75 purchase has bee...,0.0,Misc
t3fbp2,1,comment,yes,I love the first aid beauty eye cream!,I love the first aid beauty eye cream,0.0,Advice
t3fbp2,1,comment,yes,Clean beauty has a different meaning for every...,Clean beauty has a different meaning for every...,0.0,Advice
t3fbp2,1,comment,yes,https://www.shoprescuespa.com/liposmose-serum-...,BR is high end but this serum for me has been ...,0.0,Advice


In [17]:
sep = pd.concat([sep_sub, sep_cmt], ignore_index=True)

In [18]:
sep.shape

(208803, 7)

In [19]:
sep.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 208803 entries, 0 to 208802
Data columns (total 7 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   num_comments     101110 non-null  float64
 1   score            208803 non-null  int64  
 2   link_flair_text  58344 non-null   object 
 3   post             208803 non-null  object 
 4   covid_onset      208803 non-null  object 
 5   original_text    208803 non-null  object 
 6   cleaned_text     208803 non-null  object 
dtypes: float64(1), int64(1), object(5)
memory usage: 11.2+ MB


In [20]:
scikit_stop = text.ENGLISH_STOP_WORDS
print(len(text.ENGLISH_STOP_WORDS))
print(text.ENGLISH_STOP_WORDS)

318
frozenset({'around', 'do', 'cant', 'however', 'none', 'still', 'yourselves', 'describe', 'behind', 'a', 'should', 'who', 'been', 'done', 'inc', 'this', 'latter', 'somewhere', 'see', 'go', 'take', 'it', 'wherever', 'top', 'ourselves', 'anyway', 'thru', 'nothing', 'will', 'was', 'against', 'next', 'or', 'also', 'everyone', 'de', 'much', 'twenty', 'four', 'are', 'upon', 'hundred', 'throughout', 'become', 'sometimes', 'interest', 'mine', 'detail', 'everything', 'empty', 'whereas', 'six', 'during', 'no', 'many', 'in', 'himself', 'thin', 'becoming', 'sincere', 'ever', 'serious', 'formerly', 'some', 'between', 'by', 'with', 'on', 'not', 'well', 'alone', 'their', 'moreover', 'couldnt', 'an', 'ours', 'which', 'namely', 'yours', 'me', 'while', 'side', 'except', 'latterly', 'made', 'above', 'less', 'everywhere', 'whether', 'out', 'somehow', 'noone', 'bottom', 'again', 'hereby', 'former', 'must', 'amoungst', 'we', 'ie', 'seemed', 'can', 'five', 'last', 'several', 'myself', 'is', 'and', 'withou

In [21]:
nltk_stop = nltk.corpus.stopwords.words('english')
print(len(nltk_stop))
print(nltk_stop)

179
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than

In [22]:
get_stop = get_stop_words('english')
print(len(get_stop_words('english')))
print(get_stop_words('english'))

174
['a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and', 'any', 'are', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', "can't", 'cannot', 'could', "couldn't", 'did', "didn't", 'do', 'does', "doesn't", 'doing', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', "hadn't", 'has', "hasn't", 'have', "haven't", 'having', 'he', "he'd", "he'll", "he's", 'her', 'here', "here's", 'hers', 'herself', 'him', 'himself', 'his', 'how', "how's", 'i', "i'd", "i'll", "i'm", "i've", 'if', 'in', 'into', 'is', "isn't", 'it', "it's", 'its', 'itself', "let's", 'me', 'more', 'most', "mustn't", 'my', 'myself', 'no', 'nor', 'not', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'ought', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 'same', "shan't", 'she', "she'd", "she'll", "she's", 'should', "shouldn't", 'so', 'some', 'such', 'than', 'that', "that's", 'the', 'their', 'theirs', 'them', 'themselves'

In [23]:
s1 = set(nltk_stop)
s2 = set(get_stop)
#stopwords in nltk but not get stop words
print(len(s1 - s2))
s1 - s2

37


{'ain',
 'aren',
 'can',
 'couldn',
 'd',
 'didn',
 'doesn',
 'don',
 'hadn',
 'hasn',
 'haven',
 'isn',
 'just',
 'll',
 'm',
 'ma',
 'mightn',
 "mightn't",
 'mustn',
 'needn',
 "needn't",
 'now',
 'o',
 're',
 's',
 'shan',
 "should've",
 'shouldn',
 't',
 "that'll",
 've',
 'wasn',
 'weren',
 'will',
 'won',
 'wouldn',
 'y'}

In [24]:
s1 = set(nltk_stop)
s2 = set(get_stop)
#stopwords in get_stop_words but not in nltk
print(len(s2 - s1))
s2 - s1

32


{"can't",
 'cannot',
 'could',
 "he'd",
 "he'll",
 "he's",
 "here's",
 "how's",
 "i'd",
 "i'll",
 "i'm",
 "i've",
 "let's",
 'ought',
 "she'd",
 "she'll",
 "that's",
 "there's",
 "they'd",
 "they'll",
 "they're",
 "they've",
 "we'd",
 "we'll",
 "we're",
 "we've",
 "what's",
 "when's",
 "where's",
 "who's",
 "why's",
 'would'}

In [25]:
add_stop_words = ['A', 'also', 'Am', 'but', 'can', 'I','If', 'Is', 'It', 'Its', 'just', 'My','since', 'So', 'still','That', 'The', 'They', 'This', 'though', 'want', 'well', 'Well','And', 'But', 'Not', 'You','Have','Has', 'go','much','Sephora', 'sephora','get','Get','got','Try','try','tried','use','used','will','make','really']


In [26]:
#going with get_stop_words, doesn't remove words that could be a key word in sephora corpus, eg: first - which could be 'first aid beauty' or 'first rate item'
get_stop.extend(add_stop_words)
stopwords = get_stop

In [27]:
print(stopwords)

['a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and', 'any', 'are', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', "can't", 'cannot', 'could', "couldn't", 'did', "didn't", 'do', 'does', "doesn't", 'doing', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', "hadn't", 'has', "hasn't", 'have', "haven't", 'having', 'he', "he'd", "he'll", "he's", 'her', 'here', "here's", 'hers', 'herself', 'him', 'himself', 'his', 'how', "how's", 'i', "i'd", "i'll", "i'm", "i've", 'if', 'in', 'into', 'is', "isn't", 'it', "it's", 'its', 'itself', "let's", 'me', 'more', 'most', "mustn't", 'my', 'myself', 'no', 'nor', 'not', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'ought', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 'same', "shan't", 'she', "she'd", "she'll", "she's", 'should', "shouldn't", 'so', 'some', 'such', 'than', 'that', "that's", 'the', 'their', 'theirs', 'them', 'themselves', 't

In [28]:
lemmatizer = WordNetLemmatizer()

In [29]:
stemmer = SnowballStemmer("english", ignore_stopwords=True)

In [30]:
def lower_lem_stop(df):    
    df['stop_cleaned'] = df['cleaned_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stopwords]))
       
    df['stop_lem_lower_cleaned'] = df['stop_cleaned'].apply(lambda x: x.lower()).apply(lambda x: ' '.join([lemmatizer.lemmatize(i) for i in x.split()])) 
    
    df['stop_stem_lower_cleaned'] = df['stop_cleaned'].apply(lambda x: x.lower()).apply(lambda x: ' '.join([stemmer.stem(i) for i in x.split()]))

    return df

In [31]:
lower_lem_stop(sep)

Unnamed: 0,num_comments,score,link_flair_text,post,covid_onset,original_text,cleaned_text,stop_cleaned,stop_lem_lower_cleaned,stop_stem_lower_cleaned
0,0.0,1,,submission,no,has anyone ever used this lipstick?,has anyone ever used this lipstick,anyone ever lipstick,anyone ever lipstick,anyon ever lipstick
1,1.0,1,,submission,no,Are there any gift sets worth getting this hol...,Are there any gift sets worth getting this hol...,Are gift sets worth getting holiday season,are gift set worth getting holiday season,are gift set worth get holiday season
2,1.0,1,,submission,no,Sephora Free 2014 MAKE UP FOR EVER Birthday Se...,Sephora Free 2014 MAKE UP FOR EVER Birthday Se...,Free 2014 MAKE UP FOR EVER Birthday Set With A...,free 2014 make up for ever birthday set with a...,free 2014 make up for ever birthday set with a...
3,0.0,1,,submission,no,In a Gift Box Reviews - Sephora Gift Cards,In a Gift Box Reviews Sephora Gift Cards,In Gift Box Reviews Gift Cards,in gift box review gift card,in gift box review gift card
4,2.0,2,,submission,no,"Sephora VIB Rouge, Is it worth it? Do new VIB ...",Sephora VIB Rouge Is it worth it Do new VIB RO...,VIB Rouge worth Do new VIB ROUGE members welco...,vib rouge worth do new vib rouge member welcom...,vib roug worth do new vib roug member welcom b...
...,...,...,...,...,...,...,...,...,...,...
208798,,1,,comment,yes,Right? We’re not Jeff Bezos yeesh. It’s sugar!,Right Were not Jeff Bezos yeesh Its sugar,Right Were Jeff Bezos yeesh sugar,right were jeff bezos yeesh sugar,right were jeff bezo yeesh sugar
208799,,1,,comment,yes,Aquaphor is amazing!,Aquaphor is amazing,Aquaphor amazing,aquaphor amazing,aquaphor amaz
208800,,1,,comment,yes,Less than 1/4 ounce of sugar and oil at that.,Less than 14 ounce of sugar and oil at that,Less 14 ounce sugar oil,le 14 ounce sugar oil,less 14 ounc sugar oil
208801,0.0,1,Advice,comment,yes,https://www.shoprescuespa.com/liposmose-serum-...,BR is high end but this serum for me has been ...,BR high end serum amazing Only takes like 23 d...,br high end serum amazing only take like 23 dr...,br high end serum amaz only take like 23 drop ...


In [32]:
duplicateRowsDF = sep[sep.duplicated()]
duplicateRowsDF.shape

(3208, 10)

In [33]:
sep.drop_duplicates(subset=None, keep= 'first', inplace=True)

In [34]:
sep.shape

(205595, 10)

In [35]:
sep

Unnamed: 0,num_comments,score,link_flair_text,post,covid_onset,original_text,cleaned_text,stop_cleaned,stop_lem_lower_cleaned,stop_stem_lower_cleaned
0,0.0,1,,submission,no,has anyone ever used this lipstick?,has anyone ever used this lipstick,anyone ever lipstick,anyone ever lipstick,anyon ever lipstick
1,1.0,1,,submission,no,Are there any gift sets worth getting this hol...,Are there any gift sets worth getting this hol...,Are gift sets worth getting holiday season,are gift set worth getting holiday season,are gift set worth get holiday season
2,1.0,1,,submission,no,Sephora Free 2014 MAKE UP FOR EVER Birthday Se...,Sephora Free 2014 MAKE UP FOR EVER Birthday Se...,Free 2014 MAKE UP FOR EVER Birthday Set With A...,free 2014 make up for ever birthday set with a...,free 2014 make up for ever birthday set with a...
3,0.0,1,,submission,no,In a Gift Box Reviews - Sephora Gift Cards,In a Gift Box Reviews Sephora Gift Cards,In Gift Box Reviews Gift Cards,in gift box review gift card,in gift box review gift card
4,2.0,2,,submission,no,"Sephora VIB Rouge, Is it worth it? Do new VIB ...",Sephora VIB Rouge Is it worth it Do new VIB RO...,VIB Rouge worth Do new VIB ROUGE members welco...,vib rouge worth do new vib rouge member welcom...,vib roug worth do new vib roug member welcom b...
...,...,...,...,...,...,...,...,...,...,...
208798,,1,,comment,yes,Right? We’re not Jeff Bezos yeesh. It’s sugar!,Right Were not Jeff Bezos yeesh Its sugar,Right Were Jeff Bezos yeesh sugar,right were jeff bezos yeesh sugar,right were jeff bezo yeesh sugar
208799,,1,,comment,yes,Aquaphor is amazing!,Aquaphor is amazing,Aquaphor amazing,aquaphor amazing,aquaphor amaz
208800,,1,,comment,yes,Less than 1/4 ounce of sugar and oil at that.,Less than 14 ounce of sugar and oil at that,Less 14 ounce sugar oil,le 14 ounce sugar oil,less 14 ounc sugar oil
208801,0.0,1,Advice,comment,yes,https://www.shoprescuespa.com/liposmose-serum-...,BR is high end but this serum for me has been ...,BR high end serum amazing Only takes like 23 d...,br high end serum amazing only take like 23 dr...,br high end serum amaz only take like 23 drop ...


In [36]:
pd.DataFrame(sep).to_csv('datasets/sep_combi_final_preprocessed.csv', index=False)