forked from thepycoach/netflix
-
Notifications
You must be signed in to change notification settings - Fork 0
/
cleaning.py
42 lines (42 loc) · 2.93 KB
/
cleaning.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import re
def clean_transcripts(text):
#if it has more than 100 group of '4 upper-cased words in a row' then most likely the whole
#transcript is in upper case. If so, do NOT remove the dialogues' names
if len(re.findall('[A-Z]{2,} [A-Z]{2,} [A-Z]{2,} [A-Z]{2,}', text)) > 100:
text = text.lower()
text = re.sub("'s", ' s', text) #splitting possesion e.g. "Andy's toys" -> "Andy s toys". it helps in vocabulary coverage. gives 's'. Count Vectorizer will ignore it
text = re.sub("'re", ' re', text) # gives 're'. Count Vectorizer will NOT ignore it
text = re.sub("'d", ' d', text) # gives 'd'. Count Vectorizer will ignore it
text = re.sub("can't", 'cannot', text)
text = re.sub("won't", 'will not', text)
text = re.sub("n't", ' not', text)
text = re.sub("'ll", ' will', text)
text = re.sub("'ve", ' have', text)
text = re.sub("'m", ' am', text)
text = re.sub('-',' ',text) #must include this one in next rounf !!!
text = re.sub('\[.*?\]','',text) #get rid of words inside square brackets + sq. brackets
text = re.sub('\(.*?\)','',text) #get rid of words inside parentheses + parentheses
text = re.sub('[^\w\s]','',text) #get rid of punctuation
text = re.sub('\w*\d\w*','',text) #remove digits or words containing digits
text = re.sub('_','', text) #transcript contains this: __hello
text = re.sub(r'\w*([a-z])\1{2,}\w*','', text) # repetitive letters aah ohhhh: you don't need \b cause it needs 3 repetitive letters together
else:
text = re.sub('[A-Z]{2,}', '', text) # get rid of dialogues' names e.g. WOODY: bla bla bla. BUZZ: bla
text = text.lower()
text = re.sub("'s", ' s', text) #splitting possesion e.g. "Andy's toys" -> "Andy s toys". it helps in vocabulary coverage. gives 's'. Count Vectorizer will ignore it
text = re.sub("'re", ' re', text) # gives 're'. Count Vectorizer will NOT ignore it
text = re.sub("'d", ' d', text) # gives 'd'. Count Vectorizer will ignore it
text = re.sub("can't", 'cannot', text)
text = re.sub("won't", 'will not', text)
text = re.sub("n't", ' not', text)
text = re.sub("'ll", ' will', text)
text = re.sub("'ve", ' have', text)
text = re.sub("'m", ' am', text)
text = re.sub('-',' ',text) #must include this one in next rounf !!!
text = re.sub('\[.*?\]','',text) #get rid of words inside square brackets + sq. brackets
text = re.sub('\(.*?\)','',text) #get rid of words inside parentheses + parentheses
text = re.sub('[^\w\s]','',text) #get rid of punctuation
text = re.sub('\w*\d\w*','',text) #remove digits or words containing digits
text = re.sub('_','', text) #transcript contains this: __hello
text = re.sub(r'\w*([a-z])\1{2,}\w*','', text) # repetitive letters aah ohhhh: you don't need \b cause it needs 3 repetitive letters together
return text