# Load data from CSV files:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt  # used only by the commented-out plots below
import gensim
DATASET = "_jobsv1_all"
DATASET = "_jobsv1_goodq"
# This scraped dataset specific functions:
csv_files = [
    "../JobAdsData/Source1-Monster/creative_jobs_clean_1363.csv",
    "../JobAdsData/Source1-Monster/IT_jobs_clean_1511.csv",
    "../JobAdsData/Source2-Reed/creative2_pages_clean_1359.csv",
    "../JobAdsData/Source2-Reed/it2_pages_clean_1556.csv",
    #"../JobAdsData/Source3-Indeed/creative2_pages_clean_4164.csv",
    #"../JobAdsData/Source3-Indeed/it2_pages_clean_986.csv"
]
# Structure:
# Page_URL,JobTitle,JobDescription,Location,Type,Posted,Industries,Salary
total_lines = 0
total_ignored_short = 0
full_text_dataset = []
full_text_titles = []
lengths_descriptions = []
THR_min_desc_length_allowed = 400   # skip descriptions shorter than this
THR_exclude_long_errs = 26000       # skip overly long descriptions (likely scraping errors)
THR_supposed_to_have_values = 8     # expected number of columns per row (see structure above)
for csv_file in csv_files:
    print("Loading file:", csv_file)
    df = pd.read_csv(csv_file, delimiter=',')
    for line in df.values:
        title = line[1]
        description = line[2]
        category_sort_of_like = line[6]  # "Industries" column (currently unused)
        # Some lines contain incorrectly scraped data; pd.notna() is more robust
        # than "is not np.nan", which can miss NaN floats created by pandas:
        if pd.notna(description) and pd.notna(title):
            # Skip records broken by splitting (only 3 such records):
            if len(line) > THR_supposed_to_have_values and pd.notna(line[-1]):
                #print(len(line), csv_file, line)
                continue
            if len(description) < THR_min_desc_length_allowed:
                #print(csv_file, line)
                total_ignored_short += 1
                continue
            if len(description) > THR_exclude_long_errs:
                total_ignored_short += 1
                continue
            #print(title, "|||", len(description), description)
            full_text_dataset.append(description)
            full_text_titles.append(title)
            lengths_descriptions.append(len(description))
            total_lines += 1
lengths_descriptions = np.asarray(lengths_descriptions)
print("--=============================--")
print("Read",total_lines,"lines in total.")
print("Skipped",total_ignored_short,"too short lines (using thr", THR_min_desc_length_allowed,")")
if total_lines == 0:
    print("No lines were read from the input!")
else:
    print("Statistics of lengths (min, max, avg):", np.min(lengths_descriptions),
          np.max(lengths_descriptions), np.average(lengths_descriptions))
    #plt.plot(lengths_descriptions)
    #plt.ylabel('lengths of descriptions in scraped data')
    #plt.show()
    # Check the longest entry? Seems alright.
    # (guarded by the else branch so np.argmax never sees an empty array)
    longest_i = np.argmax(lengths_descriptions)
    #print(lengths_descriptions[longest_i])
    #print(full_text_dataset[longest_i])
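# Optional sketch (uncomment to use): percentiles give a quick feel for the
# length distribution without plotting; np.percentile is standard numpy.
#if total_lines > 0:
#    print("Length percentiles (25/50/75):",
#          np.percentile(lengths_descriptions, [25, 50, 75]))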
from gensim.parsing.preprocessing import remove_stopwords, stem_text, strip_punctuation
# Hack: in the scraped text some words are joined only by punctuation
# (e.g. a bare "."), so pad punctuation characters with spaces to separate them:
PUNCT_TO_PAD = ['!', '?', '.', ',', ':', ';', '(', ')', '/', '*', ']', '[', '&', '_']
for i in range(len(full_text_dataset)):
    for ch in PUNCT_TO_PAD:
        full_text_dataset[i] = full_text_dataset[i].replace(ch, ' ' + ch + ' ')
    full_text_dataset[i] = remove_stopwords(full_text_dataset[i])
    # slower processing from:
    #full_text_dataset[i] = strip_punctuation(full_text_dataset[i])  # our default seems to be better
    #full_text_dataset[i] = stem_text(full_text_dataset[i])  # do we want a "porter-stemmed version"?
    # gensim.utils.tokenize yields alphabetic tokens only, so the padded
    # punctuation (and digits) are dropped here:
    full_text_dataset[i] = " ".join(gensim.utils.tokenize(full_text_dataset[i]))
documents = full_text_dataset
titles = full_text_titles
# Save the cleaned descriptions and titles as compressed numpy archives
# (assumes the "data/" directory already exists):
np.savez_compressed("data/documents" + DATASET + ".npz", a=documents)
np.savez_compressed("data/titles" + DATASET + ".npz", a=titles)