-
Notifications
You must be signed in to change notification settings - Fork 2
/
clevercsv_test.py
122 lines (115 loc) · 4.59 KB
/
clevercsv_test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import collections
import clevercsv
import time
import os
import sys
from CSV_Wrangling.data.github import clevercsv_failed_cases_test
def getCSVdata(path: str)->str:
    """Return the text content of the file at ``path`` exactly as read.

    The file is opened in text mode with the platform default encoding.

    Args:
        path: Location of the file to read.

    Returns:
        The whole file content as one string; an empty file yields ``''``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(path) as f:
        # Read verbatim. Joining the lines with a separator such as ' ' would
        # prefix every line after the first with a character that is not in
        # the file, altering the data handed to the dialect sniffer.
        return f.read()
def DetectCSVDialect(path: str):
    """Sniff the CSV dialect of the file at ``path`` with CleverCSV.

    The file text is obtained through ``getCSVdata`` and handed to
    ``clevercsv.Sniffer().sniff`` together with a fixed list of candidate
    delimiters. ``verbose=True`` is passed so CleverCSV shows what it does.

    Returns:
        The dialect returned by the sniffer. If reading the file or sniffing
        raises, the error is printed as ``Error was: ...`` and ``None`` is
        returned instead.
    """
    candidate_delimiters = [',', ';', '\t', '|', ':', '.', '=', '<', '>', ' ']
    try:
        text = getCSVdata(path)
        sniffer = clevercsv.Sniffer()
        return sniffer.sniff(text, delimiters=candidate_delimiters, verbose=True)
    except Exception as exc:
        # OSError is a subclass of Exception and was reported with the same
        # message, so a single handler covers both cases.
        print("Error was: %s" % exc)
    return None
def ImportExpectedResults(aResultFileName:str)->dict:
    """Load the expected dialect annotations from a file next to this script.

    ``aResultFileName`` is resolved against the directory of this file and
    parsed with ``clevercsv.reader`` using ``|`` as delimiter and an empty
    quote character. Rows with fewer than two fields are ignored. The first
    remaining row is skipped (presumably the header row - confirm against the
    annotation files). Each later row is stored under its first field as a
    dict holding fields 1 to 5 under the keys ``encoding``,
    ``fields_delimiter``, ``quotechar``, ``escapechar`` and
    ``records_delimiter``.

    Returns:
        The mapping from first field to annotation dict. If anything raises
        (missing file, short row, ...), the error is printed as
        ``Error was: ...`` and ``None`` is returned.
    """
    column_keys = (
        'encoding',
        'fields_delimiter',
        'quotechar',
        'escapechar',
        'records_delimiter',
    )
    try:
        annotations_path = os.path.join(os.path.dirname(__file__), aResultFileName)
        with open(annotations_path, newline='') as handle:
            expected = {}
            header_skipped = False
            for record in clevercsv.reader(handle, delimiter='|', quotechar=''):
                if len(record) <= 1:
                    continue
                if not header_skipped:
                    # First multi-field row: consume it without storing it.
                    header_skipped = True
                    continue
                expected[record[0]] = {
                    key: record[position]
                    for position, key in enumerate(column_keys, start=1)
                }
            return expected
    except Exception as exc:
        print("Error was: %s" % exc)
    return None
def GetDelName(aDelim:str)->str:
    """Translate a delimiter character into its descriptive name.

    Recognised characters are ``,`` (comma), ``;`` (semicolon), tab, space
    and ``|`` (named ``vslash``). Any other value yields ``None``.
    """
    names_by_delimiter = {
        ',': 'comma',
        ';': 'semicolon',
        '\t': 'tab',
        ' ': 'space',
        '|': 'vslash',
    }
    return names_by_delimiter.get(aDelim)
def GetQuoteName(aQuote:str)->str:
    """Translate a quote character into its descriptive name.

    A double quote and the empty string both map to ``doublequote``; a
    single quote maps to ``singlequote``. Any other value yields ``None``.
    """
    if aQuote in ('"', ''):
        return 'doublequote'
    if aQuote == "'":
        return 'singlequote'
    return None
def main(basePath: str, outPath :str):
    """Run CleverCSV dialect detection over both test suites and write reports.

    For each suite, everything printed while the suite runs is written to a
    report file inside ``outPath``. That includes the messages printed by
    ``ImportExpectedResults`` and ``DetectCSVDialect``. Every regular file in
    the suite's directory under ``basePath`` is sniffed and compared with the
    expected delimiter and quote character:

    * ``+[name]`` - detected delimiter and quote character both match.
    * ``X[name]`` - a dialect was detected but it does not match; the expected
      values are appended. These are NOT counted as failures.
    * ``X [name]`` - no dialect was obtained; counted as a failure.

    Each report ends with the passed ratio, the failure ratio (both relative
    to the number of expected-result entries, not the number of files
    visited) and the elapsed time.

    Args:
        basePath: Directory containing the ``CSV`` and ``W3C-CSVW`` folders.
        outPath: Existing directory that receives the report files.

    Raises:
        OSError: If a test directory or a report file cannot be opened.
        KeyError: If a sniffed file has no entry in the expected results.
        TypeError: If the expected results could not be loaded (``None``).
    """
    # Function-scope import so this block does not depend on the file header.
    from contextlib import redirect_stdout

    # (report file, expected-results file, test directory) for each suite.
    suites = (
        ('[POLLOCK]clevercsv_output.txt', 'Dialect_annotations.txt', 'CSV'),
        ('[W3C-CSVW]clevercsv_output.txt', 'W3C-CSVW-Dialect_annotations.txt', 'W3C-CSVW'),
    )
    for report_name, annotations_name, tests_dir in suites:
        # The context managers close the report and restore sys.stdout even if
        # the suite raises, instead of leaving sys.stdout bound to a closed file.
        with open(os.path.join(outPath, report_name), 'w') as report, \
                redirect_stdout(report):
            # Import expected results as nested dicts
            expected = ImportExpectedResults(annotations_name)
            # Directory holding this suite's test files
            tests_path = os.path.join(basePath, tests_dir)
            passed = 0
            failures = 0
            started = time.time()
            # Iterate and run all test files
            for filename in os.listdir(tests_path):
                file = os.path.join(tests_path, filename)
                if not os.path.isfile(file):
                    continue
                try:
                    dialect = DetectCSVDialect(file)
                except Exception:
                    # Not a bare except: Ctrl-C / SystemExit must still stop the run.
                    dialect = None
                if dialect is None:
                    print("X [" + filename + "]: --> No result from cleverCSV")
                    failures += 1
                    continue
                wanted = expected[filename]
                detected = 'cleverCSV detected: delimiter = %r, quotechar = %r' \
                    % (dialect.delimiter, dialect.quotechar)
                if (GetDelName(dialect.delimiter) == wanted['fields_delimiter']
                        and GetQuoteName(dialect.quotechar) == wanted['quotechar']):
                    passed += 1
                    print('+[' + filename + ']: --> ' + detected)
                else:
                    print('X[' + filename + ']: --> ' + detected
                          + '| EXPECTED:{delimiter = %r, quotechar = %r}'
                          % (wanted['fields_delimiter'], wanted['quotechar']))
            total = len(expected)
            # An empty annotation set would otherwise raise ZeroDivisionError.
            passed_ratio = round(100 * passed / total, 4) if total else 0.0
            failure_ratio = round(100 * failures / total, 4) if total else 0.0
            print('[Passed test ratio]--: %r' % (passed_ratio) + '%')
            print('[Failure ratio]--: %r' % (failure_ratio) + '%')
            print('[Elapsed time]--: %r seconds' % (round(time.time() - started, 2)))
if __name__ == "__main__":
    basePath= os.path.dirname(__file__)
    # All reports go to a 'cleverCSV' folder next to this script.
    outPath = os.path.join(basePath, 'cleverCSV')
    # exist_ok=True creates the folder only when missing, without the
    # check-then-create race of testing os.path.exists() first.
    os.makedirs(outPath, exist_ok=True)
    main(basePath, outPath)
    githubData = os.path.join(basePath, 'CSV_Wrangling', 'data', 'github')
    #Test over all files
    clevercsv_failed_cases_test.main(githubData, outPath)
    #Exclude normal CSV files
    clevercsv_failed_cases_test.main(githubData, outPath, True)