In [1]:
import pandas as pd
import re
from cn2an import cn2an, an2cn
from os import listdir
from os.path import isfile, join

In [2]:
# Directory the processed CSV files are written to.
dataPath = '../data/'
# Directory holding the raw CSV files that are read as input.
rawDataPath = '../rawdata/'
# Collect the raw CSV files only. Other regular files in the directory are
# skipped, because later cells strip a 4-character extension from each name
# and use the remainder as a month2num key. The list is sorted because
# os.listdir returns entries in arbitrary order.
filenameList = sorted(
    f for f in listdir(rawDataPath)
    if isfile(join(rawDataPath, f)) and f.lower().endswith('.csv')
)
# Maps an English month name (the stem of a raw-data file name) to its month
# number, kept as a string because it is concatenated into output file names.
# 'Auguest' is a misspelling that is kept so any file already named that way
# still resolves; the correctly spelled 'August' is accepted as well.
month2num = {
    'January': '1',
    'February': '2',
    'March': '3',
    'April': '4',
    'May': '5',
    'June': '6',
    'July': '7',
    'August': '8',
    'Auguest': '8',
    'September': '9',
    'October': '10',
    'November': '11',
    'December': '12',
}

In [3]:
# Characters accepted as part of a Chinese-written number (simplified,
# traditional and financial forms, plus the decimal-point characters).
_chineseNumeralChars = '零一二三四五六七八九十壹貳贰参參叄叁肆伍陸陆柒捌玖拾廿百佰千仟萬万點点'

# Regex character class: one or more Chinese numeral characters.
textNumber1 = '[' + _chineseNumeralChars + ']+'
# Same class, additionally accepting ASCII digits.
textNumber2 = '[' + _chineseNumeralChars + '0123456789' + ']+'

# Character-for-character substitutions applied by text_transform before a
# numeral string is handed to cn2an.
textNumberDict = {
    '貳': '贰',
    '参': '叁',
    '參': '叁',
    '叄': '叁',
    '陸': '陆',
    '點': '点',
    '萬': '万',
}
def trim_space(data):
    """Return `data` with every ASCII space character removed.

    Only the plain space (' ') is stripped; tabs, newlines and full-width
    spaces are left untouched.
    """
    pieces = data.split(' ')
    return ''.join(pieces)

def text_transform(text):
    """Replace each character of `text` that is a key of textNumberDict.

    Characters without an entry in textNumberDict are copied through
    unchanged. Returns the resulting string.
    """
    return ''.join(textNumberDict.get(ch, ch) for ch in text)

def search_main_text(data):
    """Extract the first '主文' sentence if it mentions '酒精'.

    The sentence runs from the first '主文' up to and including the next '。'.
    Returns that sentence without the leading '主文', or 'Not Found' when
    there is no such sentence or it does not contain '酒精'.
    """
    hit = re.search(r'主文[^。]*。{1}', data)
    if hit is None:
        return 'Not Found'
    sentence = hit.group(0)
    if '酒精' not in sentence:
        return 'Not Found'
    # Drop the two-character '主文' prefix.
    return sentence[2:]
# print(search_main_text('主文asdfasdf。。'))

def search_judge(data):
    """Extract the judge's name that follows '法官'.

    A name is two or three characters located between '法官' and one of the
    fixed three-character trailers (以上正, 上列正, 上正本, 本正本, 如不服,
    書記官). Returns the name with spaces removed, or 'Not Found'.
    """
    hit = re.search(r'法官(?:..|...)(?:以上正|上列正|上正本|本正本|如不服|書記官)', data)
    if hit is None:
        return 'Not Found'
    compact = hit.group(0).replace(' ', '')
    # Strip the 2-character '法官' prefix and the 3-character trailer.
    return compact[2:-3]
# print(search_judge('虎尾簡易庭法官張俊上正本證明與原本無異。 書'))

def adjust_judge(data):
    """Collapse any value containing 'Not Found' to exactly 'Not Found'.

    Values that do not contain the marker are returned unchanged.
    """
    if 'Not Found' in data:
        return 'Not Found'
    return data


def search_place(data):
    """Extract the two-character court location from the case-number header.

    Looks for '裁判字號：臺灣<XX>地方法院' and returns <XX>, or 'Not Found'
    when the header is absent.
    """
    hit = re.search(r'裁判字號：臺灣..地方法院', data)
    if hit is None:
        return 'Not Found'
    # Characters 7 and 8 are the two wildcard characters after '裁判字號：臺灣'.
    return hit.group(0)[7:9]

def search_fine(data):
    """Extract the concurrent fine amount from `data`.

    Looks for the first '併科罰金新臺幣...元' phrase and converts the first
    numeral in it (Chinese characters and/or ASCII digits) with cn2an.

    Returns the converted amount, or 0 when no fine phrase is present or the
    phrase contains no numeral.
    """
    found = re.search(r'併科罰金新臺幣[^元]*元', data)
    if found is None:
        return 0
    # Remove thousands separators between digits so that an amount written as
    # '1,000' is read as one number instead of stopping at the comma.
    phrase = re.sub(r'(?<=[0-9]),(?=[0-9])', '', found.group(0))
    numerals = re.findall(textNumber2, phrase)
    if not numerals:
        # A fine phrase without any numeral: report "no fine" instead of
        # raising IndexError.
        return 0
    return cn2an(text_transform(numerals[0]), 'smart')

def search_imprison(data):
    """Extract the fixed-term imprisonment length from `data`, in days.

    Looks for the first '處有期徒刑...月' or '處有期徒刑...年' phrase. A month
    counts as 30 days and a year as 360 days. When a year count is followed
    directly by a month count (for example '1年2月', optionally written with
    '又'), both parts are added together.

    Returns the number of days, or 0 when no imprisonment phrase is present
    or the phrase contains no numeral.
    """
    transDict = {'月': 30, '年': 360}
    found = re.search(r'處有期徒刑[^月年]*[月年]', data)
    if found is None:
        return 0
    phrase = found.group(0)
    # The pattern excludes 月/年 before its final character, so the last
    # character is the unit.
    unit = phrase[-1]
    numerals = re.findall(textNumber2, phrase)
    if not numerals:
        # No numeral before the unit: report 0 instead of raising IndexError.
        return 0
    val = cn2an(text_transform(numerals[0]), 'smart') * transDict[unit]
    if unit == '年':
        # The phrase stops at the first unit, so a compound term such as
        # '1年2月' would otherwise lose its month part.
        extra = re.match(r'又?({})月'.format(textNumber2), data[found.end():])
        if extra is not None:
            val += cn2an(text_transform(extra.group(1)), 'smart') * transDict['月']
    return val
# print(search_imprison('處有期徒刑5月'))

def search_again(data):  # recidivist ('累犯')
    """Return 1 when `data` mentions '累犯', otherwise 0."""
    if '累犯' in data:
        return 1
    return 0

def search_alcohol(data):  # alcohol concentration
    """Return the highest alcohol reading mentioned in `data`.

    The text is split into clauses that contain '酒精' and end at one of the
    delimiters （ ） 或 ， 。 、 ：. For every clause:

    * each ASCII decimal number below 10 is taken as a reading;
    * the first Chinese-numeral run of at least two characters is converted
      with cn2an and taken as a reading;
    * readings from clauses mentioning '血液' (blood) are multiplied by 5
      before being recorded alongside the other readings (presumably to put
      blood percentages on the breath scale; confirm the factor with the
      author).

    The list of readings starts with 0.15 (noted as the test standard in the
    original code), so the result is never below that value.

    Returns the maximum reading, or 'Not Found' when no clause mentions
    '酒精'.
    """
    clauses = re.findall(r'[^（）或，。、：]*酒精[^（）或，。、：]*[（）或，。、：]', data)
    if not clauses:
        return 'Not Found'

    alcohol_air = [0.15]  # test standard, used as the minimum result
    for candidate in clauses:
        # Blood readings are scaled by 5; every other reading is kept as is.
        scale = 5 if '血液' in candidate else 1

        for num in re.findall(r'[0123456789]+[.][0123456789]+', candidate):
            # Decimals of 10 or more are not treated as alcohol readings.
            if float(num) < 10:
                alcohol_air.append(float(num) * scale)

        # Two consecutive character classes require a run of >= 2 characters.
        text = re.findall(r'{}{}'.format(textNumber1, textNumber1), candidate)
        if not text:
            continue
        try:
            # Convert first, then scale the number. Scaling the string before
            # conversion would repeat the text and make the conversion fail.
            alcohol_air.append(cn2an(text_transform(text[0]), 'smart') * scale)
        except Exception:
            # Best effort: a numeral run cn2an cannot parse is skipped.
            continue
    return max(alcohol_air)
# for s in ['主文廖國銘駕駛動力交通工具而吐氣所含酒精濃度達每公升零點二五毫克以上，', '吐氣所含酒精濃度達每公升零點二五毫克或血液中酒精濃度達百分之零點零五以上。', '測得其吐氣所含酒精濃度達每公升0.29毫克，', '並有雲林縣警察局當事人酒精測定紀錄表、', '財團法人工業技術研究院呼氣酒精測試器檢定合格證書各1份在卷可稽，', '吐氣所含酒精濃度達每公升零點二五毫克或血液中酒精濃度達百分之零點零五以上。']:
#     print(s)
#     print(search_alcohol(s))

def search_education(data):
    """Extract the education level stated before '智識程度'.

    The level must be one of 國小以下, 國中, 高中, 大學 or 研究所以上, and be
    followed by '智識程度' within the same clause (no '，' or '。' in
    between). Returns the level, or 'Not Found'.
    """
    hit = re.search(r'(國小以下|國中|高中|大學|研究所以上)[^，。]*智識程度', data)
    return 'Not Found' if hit is None else hit.group(1)
# print(search_education('國中sd智識程度'))

def search_vehicle(data):
    """Return the first vehicle category mentioned in `data`.

    Only the categories listed in the pattern are recognised. Returns
    'Not Found' when none of them appears.
    """
    hit = re.search(r'(自用小客車|自用小貨車|自用大客車|自用大貨車|身心障礙者專用車|營業小客車|營業小貨車|營業大客車|營業大貨車|營業貨櫃曳引車|租賃小客車|租賃小貨車|交通車|遊覽大客車|大型重型機車|普通重型機車)[^，。]*', data)
    if hit is None:
        return 'Not Found'
    # The first vehicle type to appear is usually the one driven by the defendant.
    return hit.group(1)

In [13]:
# Turn every raw monthly file into a table of extracted fields and write it
# to dataPath as '109_<month number>.csv'.
for fn in filenameList:
    # After the index column, the raw file must hold exactly one column: the
    # judgment text.
    df = pd.read_csv(join(rawDataPath, fn), index_col=0, delimiter=',')
    df.columns = ['text']
    df['text'] = df['text'].apply(trim_space)

    # Each filter is followed by .copy() so the later column assignments act
    # on an independent frame rather than on a slice of the previous one
    # (which triggers SettingWithCopyWarning).
    df['place'] = df['text'].apply(search_place)
    df = df[df.place != 'Not Found'].copy()

    df['main'] = df['text'].apply(search_main_text)
    df = df[df.main != 'Not Found'].copy()

    # The judge label is the court location followed by the judge's name; it
    # collapses to 'Not Found' when the name could not be extracted.
    df['judge'] = df['place'] + df['text'].apply(search_judge)
    df['judge'] = df['judge'].apply(adjust_judge)
    df['fine'] = df['main'].apply(search_fine)

    # Keep only sentences between 30 and 360 days, inclusive.
    df['imprison'] = df['main'].apply(search_imprison)
    df = df[(df.imprison >= 30) & (df.imprison <= 30 * 12)].copy()

    df['again'] = df['main'].apply(search_again)
    # Readings of 2 or more are discarded; no lower bound is applied.
    df['alcohol'] = df['text'].apply(search_alcohol)
    df = df[df['alcohol'] < 2].copy()

    df['education'] = df['text'].apply(search_education)
    df['vehicle'] = df['text'].apply(search_vehicle)

    # fn[:-4] removes the 4-character extension, leaving the month name that
    # month2num translates to a number.
    df.to_csv(join(dataPath, '109_' + month2num[fn[:-4]] + '.csv'))

In [14]:
# Read back every monthly result file and stack them into one yearly file.
df_list = []
for fn in filenameList:
    # fn[:-4] is the raw file name without its 4-character extension.
    monthNumber = month2num[fn[:-4]]
    df_list.append(
        pd.read_csv(dataPath + '109_' + monthNumber + '.csv', index_col=0, delimiter=',')
    )
df_all = pd.concat(df_list)
df_all.to_csv(dataPath + '109.csv')