# References
* https://commons.wikimedia.org/wiki/File:ASCII-Table-wide.svg
* https://en.wikipedia.org/wiki/Thai_(Unicode_block)

# Test Thai phone generator

In [1]:
import rstr
print(
    rstr.xeger(r'(0(-)?2\d{3}(-)?\d{4})'),
    rstr.xeger(r'(0(-)?[3-9]\d{3}(-)?\d{4})')
)

0-2664-8023 054999440


# Generate full dataset

In [2]:
import numpy as np
import pandas as pd
import re
from faker import Faker


def my_normalize(probs):
    prob_factor = 1 / sum(probs)
    return [prob_factor * p for p in probs]

def my_random_choice(c={'a':0.6,'b':0.4},k=5):
    p=np.array([v for k,v in c.items()])
    p=my_normalize(p)
    a=np.array([k for k,v in c.items()])
    x=np.random.choice(a=a, size=k, p=list(p))
    return x

def my_fake_phone(k=5):
    reg=r'(0(-)?2\d{3}(-)?\d{3,5})|(0(-)?[3-9]\d{3}(-)?\d{3,5})'
    return [rstr.xeger(reg) for i in range(k)]

n=1000
c={
    'กลาง':0.5,
    'ภาคกลาง':0.5,
    'ภาคกลาง ':0.5,
    'เหนือ':0.5,
    'ภาคเหนือ':0.5,
    'ใต้':0.5,
    'ภาคใต้ ':0.5,
    'ตะวันออก':0.5,
    'ภาคตะวันออก':0.5,
    'ภาคตะวันออก ':0.5,
    'ตะวันตก':0.5,
    'ตะวันตก\u000d':0.5,
    'ตะวันตก\u000a':0.5,
    'ตะวันตก\u000d\u000a':0.5,
    'ตะวันออกเฉียงเหนือ':0.5,
    'ภาคตะวันออกเฉียงเหนือ':0.5,
    '':0.5
}
f = Faker()
d={
    'email':[f.company_email() for i in range(n)],
    'area':my_random_choice(c,n),
    'phone':my_fake_phone(n),
    'date':[f.date_between(start_date='-5y',end_date='-1d') for i in range(n)]
}
dt={
    'email':'string',
    'area':'string',
    'phone':'string',
    'date':'string',
}
df=pd.DataFrame(d)
# df.astype(dt)
df.info()
df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   email   1000 non-null   object
 1   area    1000 non-null   object
 2   phone   1000 non-null   object
 3   date    1000 non-null   object
dtypes: object(4)
memory usage: 31.4+ KB


Unnamed: 0,email,area,phone,date
0,victoriameyer@watts.com,,0-5791-437,2018-10-12
1,leslierodriguez@miller-davenport.net,กลาง,02477-13974,2019-07-05
2,hscott@duran.com,ตะวันตก\n,091617714,2021-01-11
3,owilson@cochran-montgomery.com,ภาคเหนือ,0-9798-57785,2019-02-12
4,ryan15@morrison.biz,ภาคกลาง,07440-312,2020-04-15
...,...,...,...,...
995,barnesstephen@hodges.com,ตะวันออก,0-2188104,2019-11-16
996,daniel60@taylor.com,ใต้,0-20247303,2019-10-31
997,uturner@jones.com,กลาง,07520-114,2020-09-23
998,sullivankimberly@perkins.com,ตะวันออก,02172-065,2019-03-24


# Adding some noise

In [3]:
shape=df.shape
mask=np.random.choice([True, False], size=shape, p=[0.1, 0.9])
df2=df
df2[mask]=[rstr.xeger(r'\w{0,20}') for i in range(mask.sum())]
df2

Unnamed: 0,email,area,phone,date
0,v96,,0-5791-437,2018-10-12
1,leslierodriguez@miller-davenport.net,E2,02477-13974,2019-07-05
2,hscott@duran.com,ตะวันตก\n,091617714,2021-01-11
3,owilson@cochran-montgomery.com,ภาคเหนือ,0-9798-57785,2019-02-12
4,ryan15@morrison.biz,ภาคกลาง,07440-312,mkc5K1v45
...,...,...,...,...
995,barnesstephen@hodges.com,ตะวันออก,0-2188104,2019-11-16
996,daniel60@taylor.com,ใต้,0-20247303,2019-10-31
997,uturner@jones.com,กลาง,07520-114,2reBGNs
998,sullivankimberly@perkins.com,ตะวันออก,02172-065,2019-03-24


# Saving to file

In [4]:
fname='regex_data.xlsx'
df.to_excel(fname)
df =pd.read_excel(fname)
df.info()
df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  1000 non-null   int64 
 1   email       994 non-null    object
 2   area        934 non-null    object
 3   phone       994 non-null    object
 4   date        997 non-null    object
dtypes: int64(1), object(4)
memory usage: 39.2+ KB


Unnamed: 0.1,Unnamed: 0,email,area,phone,date
0,0,v96,,0-5791-437,2018-10-12 00:00:00
1,1,leslierodriguez@miller-davenport.net,E2,02477-13974,2019-07-05 00:00:00
2,2,hscott@duran.com,ตะวันตก\n,091617714,2021-01-11 00:00:00
3,3,owilson@cochran-montgomery.com,ภาคเหนือ,0-9798-57785,2019-02-12 00:00:00
4,4,ryan15@morrison.biz,ภาคกลาง,07440-312,mkc5K1v45
...,...,...,...,...,...
995,995,barnesstephen@hodges.com,ตะวันออก,0-2188104,2019-11-16 00:00:00
996,996,daniel60@taylor.com,ใต้,0-20247303,2019-10-31 00:00:00
997,997,uturner@jones.com,กลาง,07520-114,2reBGNs
998,998,sullivankimberly@perkins.com,ตะวันออก,02172-065,2019-03-24 00:00:00
