In [1]:
from snownlp import SnowNLP
import pandas as pd
import time
from datetime import timedelta, date, datetime
from dateutil.relativedelta import relativedelta

import pycantonese
from pycantonese.word_segmentation import Segmenter
from collections import Counter

### PyCantonese

In [2]:
%%time
# Segment a couple of sample Cantonese comments with PyCantonese and print
# the resulting tokens next to the original text.

# Accumulators kept for collecting segmented words; nothing in this cell
# fills them yet.
total_segmented_word = list()
segmented_word = list()

# Segmenter that allows words of up to 8 characters.
# Note: only max_word_length is set; no custom word list is supplied in this call.
segmenter = Segmenter(max_word_length=8)

comment_context = [
    '係升返少少， 但仲未去到2019個價， 真係好癲',
    '用左成年去跌，估唔到升得咁快，姐係其實大家真係好多錢，睇下買唔買然而#hehe# 邊到有移民潮走資潮',
]

for context in comment_context:
    # Split the raw comment into a list of word tokens.
    temp_result = pycantonese.segment(context, cls=segmenter)
    print("Orginal:\n", context)
    print("\nResult:\n", temp_result)
    print('=' * 50)

# TODO: POS-tag the tokens (pycantonese.pos_tag) and keep only the
# ADJ / NOUN / VERB ones. This was previously sketched as a triple-quoted
# string at the end of the cell, which Jupyter echoed as the cell's result.

Orginal:
 係升返少少， 但仲未去到2019個價， 真係好癲

Result:
 ['係', '升', '返', '少少', '，', '但', '仲未', '去到', '2019', '個', '價', '，', '真係', '好', '癲']
Orginal:
 用左成年去跌，估唔到升得咁快，姐係其實大家真係好多錢，睇下買唔買然而#hehe# 邊到有移民潮走資潮

Result:
 ['用', '左', '成年', '去', '跌', '，', '估', '唔', '到', '升', '得', '咁快', '，', '姐', '係', '其實', '大家', '真係', '好多', '錢', '，', '睇下', '買', '唔', '買', '然而', '#', 'hehe', '#', '邊', '到', '有', '移民潮', '走資', '潮']
Wall time: 3.51 s


"\n    for word, pos in temp_result:\n        if pos == 'ADJ' or pos == 'NOUN' or pos == 'VERB':\n"

### SnowNLP

In [3]:
# Tokenise the same sample comments with SnowNLP so the output can be
# compared with the PyCantonese segmentation printed earlier.
for context in comment_context:
    s = SnowNLP(context)
    # .words is SnowNLP's token list for the text.
    word = s.words
    print("Orginal:\n", context)
    print("\nResult:\n", word)
    print("=" * 50)

Orginal:
 係升返少少， 但仲未去到2019個價， 真係好癲

Result:
 ['係升', '返少', '少', '，', '但', '仲', '未', '去', '到', '2019', '個', '價', '，', '真', '係', '好', '癲']
Orginal:
 用左成年去跌，估唔到升得咁快，姐係其實大家真係好多錢，睇下買唔買然而#hehe# 邊到有移民潮走資潮

Result:
 ['用', '左', '成年', '去跌', '，', '估', '唔', '到', '升得', '咁快', '，', '姐', '係', '其', '實', '大', '家', '真', '係', '好', '多', '錢', '，', '睇', '下', '買', '唔', '買', '然而', '#hehe#', '邊到', '有', '移民', '潮', '走', '資潮']


In [4]:
# Score each sample comment with SnowNLP's sentiment classifier.
# The score is a float between 0 and 1 (not 0-100): above 0.5 reads as
# positive, below 0.5 as negative.
# NOTE(review): SnowNLP's bundled model is reportedly trained on Simplified
# Chinese shopping reviews; scores for Traditional Chinese / Cantonese text
# may be unreliable -- confirm before drawing conclusions from them.
for context in comment_context:
    s = SnowNLP(context)
    sentiments = s.sentiments
    print("Orginal:\n", context)
    print("\nResult:\n", sentiments)
    print('=' * 50)

Orginal:
 係升返少少， 但仲未去到2019個價， 真係好癲

Result:
 0.743106931949525
Orginal:
 用左成年去跌，估唔到升得咁快，姐係其實大家真係好多錢，睇下買唔買然而#hehe# 邊到有移民潮走資潮

Result:
 0.9898619250983283


### LIHKG Title with SnowNLP Sentiment

In [5]:
# Read the thread log from the Excel workbook sitting next to the notebook,
# then pull the "Thread Title" column out as a plain Python list.
df_thread_log = pd.read_excel("LIHKG_Thread_Log.xlsx")
threads_title = df_thread_log["Thread Title"].to_list()

In [6]:
# Score every thread title with SnowNLP's sentiment classifier.
# The score is a float between 0 and 1 (not 0-100): above 0.5 reads as
# positive, below 0.5 as negative.
for title in threads_title:
    # Empty spreadsheet cells are loaded by pandas as NaN (a float), and a
    # blank title carries no text to score, so skip both instead of letting
    # the whole loop fail part-way through.
    if not isinstance(title, str) or not title.strip():
        continue
    s = SnowNLP(title)
    sentiments = s.sentiments
    print("Orginal:\n", title)
    print("\nResult:\n", sentiments)
    print('=' * 50)

Orginal:
 viu邊度黃呀？mirror邊度黃呀？反tvb就叫黃？鬼打鬼渣喎，你咪又係撐緊藍？

Result:
 0.990017213970043
Orginal:
 中東阿布扎比銀行考慮收購渣打

Result:
 0.8246773465949826
Orginal:
 渣打定期5.25

Result:
 0.16849244223532023
Orginal:
 邊間渣打多靚女推介

Result:
 0.5399458887330778
Orginal:
 渣打比人盜用左條數係唔係要找??

Result:
 0.8093406835005924
Orginal:
 渣打：比特幣將暴跌至5000美元，金價衝上3000美元

Result:
 0.4966543571168609
Orginal:
 「整渣打網上銀行手機Apps嘅工程師麻煩入一入嚟」

Result:
 0.006163266344170082
Orginal:
 渣打拎外幣方法

Result:
 0.06543543085590331
Orginal:
 渣打幾時肯出大promo吸新客 (me)

Result:
 0.12583426378409657
Orginal:
 渣打個電話究竟打得通未

Result:
 0.05285050467990837
Orginal:
 屌你老母渣打

Result:
 0.44767845850986676
Orginal:
 突發！張渣打信用卡俾人碌左萬鳩幾蚊！！

Result:
 0.10748516628821925
Orginal:
 渣打：沒有證據顯示本行出現個人資料外洩 下周開始為受影響客戶進行退款

Result:
 0.0028375078484494942
Orginal:
 渣打：近日信用卡可疑交易事件中，只有1%帳戶受影響，下星期一起陸續安排退款，並無證據顯示銀行出現個人資料外洩

Result:
 3.7325517942998765e-05
Orginal:
 點解渣打ATM入錢冇得即時更新信用卡餘額

Result:
 0.008894991232104132
Orginal:
 渣打：客戶信用卡被盜用毋須責任  呼籲不用急於致電查詢

Result:
 0.0033945384556448577
Org