preprocess.py
# authors: Leo Cuspinera and Victor Cuspinera
# date: 2021-02-16
'''This script loads the tweets dataset, performs preprocessing on the comments,
and saves the database in a new file.

Usage: preprocess.py --input_dir=<input_dir_path> --output_dir=<destination_dir_path>

Example:
    python src/preprocess.py --input_dir=tweets/ --output_dir=tweets/

Options:
--input_dir=<input_dir_path>            Directory with the consolidated data (tweets_db.json)
--output_dir=<destination_dir_path>     Directory for saving the preprocessed tweets (tweets_db_clean.json)
'''
# Libraries
import os
import string
import time

import numpy as np
import pandas as pd
import spacy
from docopt import docopt

# Load the small English spaCy pipeline once, so it is reused across all tweets
nlp = spacy.load("en_core_web_sm")
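# Note: spacy.load assumes the "en_core_web_sm" model is already installed;
# it can typically be fetched with `python -m spacy download en_core_web_sm`.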

def main(input_dir, output_dir):
    print("\n--- START: preprocess.py ---")
    start = time.time()

    # Directories check
    assert os.path.exists(input_dir), \
        "The path entered for input_dir does not exist. Make sure to enter the correct path.\n"
    assert os.path.exists(output_dir), \
        "The path entered for output_dir does not exist. Make sure to enter the correct path.\n"

    # Open tweets
    print("Loading: open the JSON file with all tweets")
    df_tot = pd.read_json(os.path.join(input_dir, 'tweets_db.json'))

    # Run preprocessing
    print('Preprocess: this step could take time, please be patient')
    df_tot['tweet'] = preprocess(df_tot['content'])

    # Save new file, dropping the raw text column
    print('Saving: saves the preprocessed tweets')
    df_tot.drop('content', axis=1).to_json(os.path.join(output_dir, 'tweets_db_clean.json'))

    print("Total time: time used to run preprocess.py:",
          np.round((time.time() - start) / 60, 2), "minutes.")
    print("--- END: preprocess.py ---\n")
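# Note: main() expects <input_dir>/tweets_db.json to be a JSON-serialized
# DataFrame with a 'content' column holding the raw tweet text; the cleaned
# text is written back under a new 'tweet' column.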

def preprocess(text, irrelevant_pos=['SPACE'], avoid_entities=['ORG']):
    """
    Identifies sensitive information in the texts and removes part of it,
    such as emails, URLs, and avoided entities.

    Parameters
    -------------
    text : (iterable of str)
        the texts to be preprocessed (e.g. a list or a pandas Series)
    irrelevant_pos : (list)
        a list of irrelevant 'pos' tags
    avoid_entities : (list)
        a list of entity labels to be avoided

    Returns
    -------------
    (list) list of preprocessed text

    Example
    -------------
    example = ["Contact me at george23@gmail.com",
               "@vcuspinera my webpage is https://vcuspinera.github.io"]
    preprocess(example)
    (output:) ['contact me at',
               'my webpage is']
    """
    result = []
    for sent in text:
        sent = str(sent).lower()
        result_sent = []
        doc = nlp(sent)
        # Text of entities whose label should be avoided
        # (this helps to drop organization names)
        entities = [str(ent) for ent in doc.ents if ent.label_ in avoid_entities]
        for token in doc:
            # Skip emails, URLs, irrelevant POS tags, and avoided entities
            if (token.like_email or
                    token.like_url or
                    token.pos_ in irrelevant_pos or
                    str(token) in entities):
                continue
            if str(token) in string.punctuation:
                # Glue punctuation onto the previous token
                try:
                    result_sent[-1] = str(result_sent[-1]) + str(token)
                except IndexError:
                    # Punctuation at the start of the sentence
                    result_sent.append(str(token))
            else:
                result_sent.append(str(token))
        result.append(" ".join(result_sent))
    return result
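# A quick sanity check mirroring the docstring example (hypothetical output;
# the exact tokens kept can vary with the installed spaCy model version):
#   preprocess(["Contact me at george23@gmail.com"])  # -> ['contact me at']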

if __name__ == "__main__":
    # Parse command-line arguments only when run as a script
    opt = docopt(__doc__)
    main(opt["--input_dir"], opt["--output_dir"])