## Parse date

In [None]:
from datetime import datetime
from dateutil import tz
import re

def parse_date(date):
    if len(date.split('.')) > 1 :
        dates = date.split(' ')
        dates[1] = re.sub("\D", "", dates[1])
        dates[3] = dates[3].split('.')[0]
        date = " ".join(dates)
        date = datetime.strptime(date, '%B %d %Y, %H:%M:%S')
    else:
        date = date[date.find(" ")+1:]
        date = datetime.strptime(date, '%b %d %H:%M:%S %Y')

    tzinfo = tz.gettz('Asia/Taipei')
    return date.replace(tzinfo=tzinfo).isoformat()

dates = [
    'December 9th 2016, 22:24:35.000',
    'Wed Oct 5 20:53:22 2016'
]
[parse_date(date) for date in dates]

## Parse author

In [19]:
author = 'assggy (IamCarmelo)'
author.split(" ")[0]

'assggy'

## Parse image_url

In [71]:
import requests
from bs4 import BeautifulSoup
import re
# url = 'https://www.ptt.cc/bbs/Beauty/M.1528654347.A.4DE.html'
url = 'https://www.ptt.cc/bbs/LoL/M.1528721007.A.53B.html'
r = requests.get(url)
soup = BeautifulSoup(r.content, 'lxml')

In [73]:
def parse_img_url(main_content):
    rule = '(?:http\:|https\:)?\/\/.*\.(?:png|jpe?g|gif)'
    items = main_content.find_all('a')
    if items:
        for item in items:
            url = re.findall(rule, item['href'])
            if url:
                return url[0]


main_content = soup.find(id="main-content")
parse_img_url(main_content)

## Mapping (PTT)

In [75]:
from elasticsearch_dsl import connections, DocType, Date, Nested, InnerDoc, Keyword, Text, Ip, Integer
from datetime import datetime
# connections.create_connection(hosts=['13.78.14.166'])
connections.create_connection(hosts=['168.62.50.20'])

class Message(InnerDoc):
    push_tag = Keyword(ignore_above=256)
    push_userid = Keyword(ignore_above=256)
    push_content = Keyword(ignore_above=256)
    push_ipdatetime = Keyword(ignore_above=256)

class Article(DocType):
    article_id = Keyword(ignore_above=256)
    article_title = Text(
        analyzer='ik_max_word',
        search_analyzer = 'ik_max_word'
    )
    author = Keyword(ignore_above=256)
    board = Keyword(ignore_above=256)
    content = Text(
        analyzer='ik_max_word',
        search_analyzer = 'ik_max_word'
    )
    date = Keyword(ignore_above=256)
    
    messages = Nested(Message)
    message_all = Integer()
    message_count = Integer()
    message_controversial = Integer()
    message_push = Integer()
    message_boo = Integer()
    message_neutral = Integer()
    
    author_parsed = Keyword(ignore_above=256)
    img_url = Keyword(ignore_above=256)
    date_parsed = Date(default_timezone='Asia/Taipei')
    ip = Keyword(ignore_above=256)

    class Meta:
        index = 'ptt-2018-06'
        doc_type = 'article'

Article.init()
print('done')

## Query Example

In [None]:
# from elasticsearch_dsl import Search
# connections.create_connection(hosts=['13.78.14.166'])
# client = Elasticsearch()
# s = Search(index="test2-2018-06") \
#     .filter("term", category="search") \
#     .query("match", title="感情")   \
#     .exclude("match", description="父母")

# s.aggs.bucket('per_tag', 'terms', field='tags') \
#     .metric('max_lines', 'max', field='lines')
# response = s.execute()
# for hit in response:
#     print(hit.meta.score, hit.title)
# for tag in response.aggregations.per_tag.buckets:
#     print(tag.key, tag.max_lines.value)

In [22]:
from elasticsearch_dsl import Search, connections
connections.create_connection(hosts=['13.78.14.166'])
board = "Boy-Girl"
board = "Gossiping"

# Term 精确
# Match 模糊
s = Search(index="test-2018-06") \
    .filter("term", board=board) \
    .query("match", content="西瓜")   \
#     .query("match", article_title="西瓜 夏天")\
#     .exclude("match", article_title="照片")

response = s.execute()
for hit in response:
    print(hit.meta.score, hit.article_title)


11.233568 [新聞] 西瓜也有身份證了 後龍西瓜節週六熱鬧登
11.233568 [新聞] 西瓜也有身份證了 後龍西瓜節週六熱鬧登
11.217221 [問卦] 有紅西瓜的西瓜汁 為什麼沒小玉西瓜汁？
11.177635 [新聞] 〈中部〉福興西瓜節週日登場 三選將齊促
11.175111 [問卦] 有紅西瓜的西瓜汁 為什麼沒小玉西瓜汁？
11.174166 Re: [問卦] 溪州有什麼特產啊
11.173001 Re: [問卦] 溪州有什麼特產啊
11.14826 [新聞] 〈中部〉福興西瓜節週日登場 三選將齊促
11.067176 Re: [新聞] 【選情初探】民進黨：攻下台北、新北　就
11.065167 Re: [新聞] 【選情初探】民進黨：攻下台北、新北　就


In [27]:
from elasticsearch_dsl import Search, connections
connections.create_connection(hosts=['13.78.14.166'])
# board = "Gossiping"
board = "Boy-Girl"
s = Search(index="ptt-2018-06") \
    .filter("term", board=board) \
    .query("match", article_title="哥哥")  \
#     .query("match", content="西瓜 夏天")\
#     .exclude("match", article_title="照片")

response = s.execute()
for hit in response:
    print(hit.message_count.count, hit.article_title, hit.author, hit.meta.score,  hit.date_parsed)

In [28]:
from elasticsearch_dsl import Search, connections
connections.create_connection(hosts=['13.78.14.166'])
board = "Boy-Girl"
board = "Gossiping"

# Term 精确
# Match 模糊
s = Search(index="ptt-2018-06") \
    .filter("term", board=board) \
    .query("match", content="西瓜")   \
#     .query("match", article_title="西瓜 夏天")\
#     .exclude("match", article_title="照片")

response = s.execute()
for hit in response:
    print(hit.message_count.count, hit.article_title, hit.author, hit.meta.score,  hit.date_parsed)


9 [新聞] 西瓜也有身份證了 後龍西瓜節週六熱鬧登 qqq87112 (kaim) 12.909994 2018-06-05T15:20:50+08:00
0 Re: [問卦] 溪州有什麼特產啊 kutkin (  ) 12.744474 2018-06-06T11:00:54+08:00
19 [問卦] 西瓜的真實身份是？ ysc1213 (ysc) 12.708532 2018-01-23T10:45:04+08:00
8 [問卦] 有紅西瓜的西瓜汁 為什麼沒小玉西瓜汁？ q0000000 (十方眾生皆蒙昧 道消魔長) 12.691474 2018-05-13T10:44:13+08:00
4 [新聞] 〈中部〉福興西瓜節週日登場 三選將齊促 qqq87112 (kaim) 12.678584 2018-06-01T08:43:56+08:00
1 Re: [新聞] 【選情初探】民進黨：攻下台北、新北　就 sunyeah (   湯元嗎) 12.550674 2018-05-20T23:55:46+08:00
2 [問卦] 憾！小X百貨竟然沒賣西瓜刀？ Marzzze (Marzzze) 12.5249 2018-05-25T14:26:09+08:00
4 [新聞] 國宴西瓜行情佳　花警啟動「護瓜專案」 gjsjhang (臺灣杉－Biang) 12.500449 2018-05-10T13:48:46+08:00
3 [新聞] 馬英九叫賣西瓜 1顆1萬8800元助弱勢 cc9i (正直與善良) 12.471011 2018-05-19T18:26:17+08:00
3 [新聞] 猥褻無極限！爆乳女模開腿露下體破西瓜　 zuvupa (阿嘉) 12.466688 2018-04-26T15:02:44+08:00


## Board Binning

In [1]:
board_au = {
    'Gossiping': 14896,
    'NBA': 4938,
    'Stock': 2951,
    'C_Chat': 2755,
    'Baseball': 2609,
    'LoL': 1737,
    'sex': 1471,
    'WomenTalk': 1234,
    'MobileComm': 1211,
    'movie': 1079,
    'Boy-Girl': 1062,
    'marvel': 1035,
    'PlayStation': 1032,
    'BabyMother': 1005,
    'Hearthstone': 981,
    'car': 926,
    'Japan_Travel': 897,
    'Lifeismoney': 894,
    'Tech_Job': 693,
    'KoreaStar': 681,
    'KR_Entertain': 663,
    'Tennis': 636,
    'Beauty': 618,
    'marriage': 578,
    'ToS': 552,
    'e-shopping': 494,
    'Tainan': 470,
    'ONE_PIECE': 450,
    'home-sale': 437,
    'PC_Shopping': 434,
    'Kaohsiung': 429,
    'MakeUp': 411,
    'studyteacher': 388,
    'Steam': 369,
    'joke': 359,
    'NSwitch': 345,
    'StupidClown': 342,
    'BuyTogether': 320,
    'japanavgirls': 310,
    'Examination': 307,
    'Japandrama': 304,
    'KoreaDrama': 300,
    'AllTogether': 292,
    'Salary': 289,
    'iOS': 264,
    'creditcard': 263,
    'Elephants': 263,
    'PokemonGO': 258,
    'BeautySalon': 258,
    'MuscleBeach': 256,
    'HatePolitics': 235,
    'CFantasy': 232,
    'TaichungBun': 205,
    'CVS': 199,
    'SportLottery': 195,
    'HardwareSale': 193,
    'GetMarry': 190,
    'FATE_GO': 183,
    'EAseries': 182,
    'Option': 182,
    'Aviation': 180,
    'job': 180,
    'Palmar_Drama': 177,
    'WOW': 176,
    'biker': 175,
    'BTS': 173,
    'Hsinchu': 171,
    'TaiwanDrama': 171,
    'YuanChuang': 164,
    'part-time': 163,
    'TypeMoon': 162,
    'PuzzleDragon': 158,
    'Headphone': 155,
    'Gamesale': 154,
    'PathofExile': 153,
    'KanColle': 151,
    'FITNESS': 145,
    'MLB': 143,
    'Food': 141,
    'TW_Entertain': 141,
    'AC_In': 137,
    'cat': 137,
    'TWICE': 137,
    'RealmOfValor': 135,
    'DSLR': 134,
    'Soft_Job': 133,
    'basketballTW': 128,
    'Wanted': 124,
    'KoreanPop': 123,
    'LGBT_SEX': 118,
    'Finance': 117,
    'WorldCup': 117,
    'lesbian': 117,
    'GBF': 115,
    'HelpBuy': 114,
    'NBA_Film': 114,
    'mobilesales': 108,
    'CN_Entertain': 107,
    'IdolMaster': 105,
    'Teacher': 105,
    'Monkeys': 105,
    'MH': 98,
    'Zastrology': 98,
    'XBOX': 97,
    'CarShop': 96,
    'DMM_GAMES': 94,
    'TY_Research': 93,
    'cookclub': 93,
    'Isayama': 92,
    'Guardians': 92,
    'MacShop': 90,
    'G-S-WARRIORS': 89,
    'PublicServan': 87,
    'Gov_owned': 86,
    'DigiCurrency': 85,
    'feminine_sex': 84,
    'Lineage': 83,
    'nb-shopping': 82,
    'BabyProducts': 79,
    'Korea_Travel': 78,
    'AKB48': 78,
    'Shadowverse': 77,
    'FORMULA1': 75,
    'Bank_Service': 74,
    'DC_SALE': 73,
    'Oversea_Job': 71,
    'Railway': 71,
    'Brand': 71,
}

In [56]:
boards = []
size =  sum([v for k, v in board_au.items()]) // 8

buffers = []
buffer = 0
for k, v in board_au.items():
    buffer += v
    buffers.append(k)
    if buffer>size:
        boards.append(buffers)
        buffers = []
        buffer = 0

In [57]:
boards

[['Gossiping'],
 ['NBA', 'Stock', 'C_Chat'],
 ['Baseball', 'LoL', 'sex', 'WomenTalk', 'MobileComm'],
 ['movie',
  'Boy-Girl',
  'marvel',
  'PlayStation',
  'BabyMother',
  'Hearthstone',
  'car',
  'Japan_Travel',
  'Lifeismoney'],
 ['Tech_Job',
  'KoreaStar',
  'KR_Entertain',
  'Tennis',
  'Beauty',
  'marriage',
  'ToS',
  'e-shopping',
  'Tainan',
  'ONE_PIECE',
  'home-sale',
  'PC_Shopping',
  'Kaohsiung',
  'MakeUp',
  'studyteacher',
  'Steam'],
 ['joke',
  'NSwitch',
  'StupidClown',
  'BuyTogether',
  'japanavgirls',
  'Examination',
  'Japandrama',
  'KoreaDrama',
  'AllTogether',
  'Salary',
  'iOS',
  'creditcard',
  'Elephants',
  'PokemonGO',
  'BeautySalon',
  'MuscleBeach',
  'HatePolitics',
  'CFantasy',
  'TaichungBun',
  'CVS',
  'SportLottery',
  'HardwareSale',
  'GetMarry',
  'FATE_GO',
  'EAseries',
  'Option',
  'Aviation',
  'job',
  'Palmar_Drama',
  'WOW',
  'biker',
  'BTS',
  'Hsinchu',
  'TaiwanDrama',
  'YuanChuang']]