In [2]:
import numpy as np
import pandas as pd

import string
import re
import html
import urllib.request
import json
import os
import pickle

import nltk
from nltk.tokenize import word_tokenize
import anago

from fuzzywuzzy import fuzz
from fuzzywuzzy import process

import seaborn as sn

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
# Load the tweet dataset; the path is relative to the notebook's working directory.
data = pd.read_csv("dataset/tweets.csv")
# Preview the first rows (the displayed columns are user_id, tweet and label).
data.head()

Unnamed: 0,user_id,tweet,label
0,66378098,[HARI PEREMPUAN INTERNASIONAL]\n\nHari Perempu...,0
1,66378098,Himasika mengucapkan selamat hari raya idul Ad...,0
2,66378098,[COMING SOON]\n\nHalo mahasiswa fisika!\nMusya...,0
3,66378098,[PENDAFTARAN BMS 2018 DIBUKA]\n\nPendaftaran B...,0
4,66378098,Persiapan hari ke 2 OKKBK Fisika ITS semangat ...,0


In [265]:
def clean_text_ner(text):
    """Normalise a raw tweet into a single line of text for POS/NER tagging.

    Steps, in order:
      1. Decode HTML entities (e.g. ``&amp;`` -> ``&``).
      2. Replace hashtags (``#`` followed by ASCII letters/digits) with a space.
      3. Replace URLs (``scheme://...`` up to the next whitespace) with a space.
      4. Collapse runs of spaces into a single space.
      5. Drop blank lines and join the remaining lines with ``', '``.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text on a single line.
    """
    text = html.unescape(text)
    text = re.sub(r'#[A-Za-z0-9]+', ' ', text)
    text = re.sub(r'\w+:\/\/\S+', ' ', text)
    text = re.sub(r' +', ' ', text)

    # Strip each line before the emptiness test: a line that held only a
    # hashtag or URL is now a lone space and must not survive as ", , ".
    # Joining directly (rather than via os.linesep and a later split on '\n')
    # keeps the result identical on every platform.
    stripped_lines = (line.strip() for line in text.splitlines())
    return ', '.join(line for line in stripped_lines if line)

In [267]:
def replace_slang(text, acronym):
    """Expand slang/abbreviations in ``text`` using the ``acronym`` lookup.

    The text is split on single spaces; every piece that is a key of
    ``acronym`` is swapped for its mapped value and everything else is kept
    verbatim. The pieces are then re-joined with single spaces.
    """
    expanded = (
        acronym[token] if token in acronym else token
        for token in text.split(' ')
    )
    return ' '.join(expanded)

In [268]:
# Load the slang/acronym lookup that is passed to replace_slang().
# JSON text is UTF-8 by specification, so the encoding is given explicitly
# instead of relying on the platform's locale default.
with open("extra/akronim.json", "r", encoding="utf-8") as f:
    acronym_map = json.load(f)

In [269]:
# Derive the NER input column: clean each raw tweet, then expand slang
# tokens through acronym_map.
data['tweet_ner'] = (
    data['tweet']
    .apply(clean_text_ner)
    .apply(replace_slang, args=(acronym_map,))
)

#### POS Tagger

https://github.com/mrrizal/POS_Tag_Indonesian

In [271]:
# Load the n-gram POS tagger object from its pickle file.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file,
# so only load a pickle obtained from a source you trust.
# The context manager guarantees the handle is closed even if unpickling raises.
with open('pos-tag-indonesian/indonesian_ngram_pos_tag.pickle', 'rb') as file:
    ngram_tagger = pickle.load(file)

In [272]:
def isascii(s):
    """Return True when ``s`` contains only ASCII characters.

    Each ASCII character encodes to exactly one byte in UTF-8 (the default
    for ``str.encode``), so the encoded length equals the string length only
    for pure-ASCII input.
    """
    return len(s.encode()) == len(s)

In [312]:
def pos_tag(text):
    """Tokenise ``text`` and assign a POS tag to every token.

    Tokens come from ``word_tokenize`` and base tags from the module-level
    ``ngram_tagger``. Two overrides are applied on top of the tagger output:

    * tokens made up solely of ASCII punctuation are tagged ``'Z'``;
    * tokens containing any non-ASCII character are tagged ``'EMO'``.

    Parameters
    ----------
    text : str
        Text to tokenise and tag.

    Returns
    -------
    tuple of (list, list)
        ``words`` and ``tags``, two parallel lists of equal length.
    """
    words = []
    tags = []

    for token, tag in ngram_tagger.tag(word_tokenize(text)):
        # Check every character: `token in string.punctuation` is a substring
        # test, so it misses multi-character tokens such as '...' or '``'.
        if all(ch in string.punctuation for ch in token):
            tag = 'Z'
        elif not isascii(token):
            tag = 'EMO'
        words.append(token)
        tags.append(tag)

    return words, tags

In [313]:
# Read the gazetteer file into `places`, one entry per line.
# rstrip('\n') removes only the line terminator; slicing with [:-1] would
# also chop the last character of a final line that has no trailing newline.
with open('extra/Gazetteer.txt') as f:
    places = [line.rstrip('\n') for line in f]

In [314]:
def match_gazetteer(text):
    """Return the best fuzzy-match score of ``text`` against the gazetteer.

    ``text`` is compared with the entries of the module-level ``places`` list
    using the ``fuzz.token_sort_ratio`` scorer. Only the score of the top
    match is returned; the matched entry itself is discarded.
    """
    best_match = process.extractOne(text, places, scorer=fuzz.token_sort_ratio)
    _matched_place, score = best_match
    return score

In [371]:
def get_string_position(text, query):
    """Wrap the first occurrence of ``query`` in ``text`` with ENAMEX tags.

    Parameters
    ----------
    text : str
        Text to annotate.
    query : str
        Exact substring to look for.

    Returns
    -------
    str
        ``text`` with the first occurrence of ``query`` enclosed in
        ``<ENAMEX TYPE="">`` ... ``</ENAMEX>``. If ``query`` is empty or does
        not occur in ``text``, ``text`` is returned unchanged.
    """
    s1 = '<ENAMEX TYPE="">'
    s2 = '</ENAMEX>'

    # str.find returns -1 when there is no match; slicing with that value
    # would insert the tags near the end of the text, so bail out instead.
    pos = text.find(query) if query else -1
    if pos == -1:
        return text

    # The wrapped span is as long as the query, not as long as the opening tag.
    end = pos + len(query)
    return text[:pos] + s1 + text[pos:end] + s2 + text[end:]

In [376]:
def _noun_runs(tags):
    """Yield inclusive (start, end) index pairs for each maximal run of 'NN'/'NNP' tags."""
    start = None
    for i, tag in enumerate(tags):
        if tag == 'NN' or tag == 'NNP':
            if start is None:
                start = i
        elif start is not None:
            yield start, i - 1
            start = None
    # A run that reaches the last tag is closed here.
    if start is not None:
        yield start, len(tags) - 1


def get_ner_label(text, tweet):
    """Mark multi-word noun phrases of ``text`` inside ``tweet`` with ENAMEX tags.

    ``text`` is POS-tagged with ``pos_tag``. Every run of two or more
    consecutive tokens tagged ``'NN'`` or ``'NNP'`` is joined with single
    spaces and, when that phrase occurs verbatim in ``tweet``, wrapped via
    ``get_string_position``. Single-token runs are ignored.

    Parameters
    ----------
    text : str
        Text to tag (the notebook passes the cleaned ``tweet_ner`` value).
    tweet : str
        Text that receives the annotations (the notebook passes the raw tweet).

    Returns
    -------
    str
        ``tweet`` with the matched phrases wrapped in ENAMEX tags.
    """
    words, tags = pos_tag(text)

    for start, end in _noun_runs(tags):
        if end == start:
            continue  # only phrases of at least two tokens are annotated

        query = ' '.join(words[start:end + 1])
        # The phrase is rebuilt from tokens of `text`, so it may not appear
        # verbatim in `tweet`; skip it rather than annotate a wrong position.
        if query in tweet:
            tweet = get_string_position(tweet, query)

    return tweet

In [377]:
# Try the annotator on the first tweet whose label is 1.
first_positive = data[data['label'] == 1].iloc[0]
new_text = get_ner_label(first_positive['tweet_ner'], first_positive['tweet'])

In [333]:
# Start with empty containers; presumably x holds NER inputs and y their
# label sequences (inferred from the names only).
# NOTE(review): no visible cell appends to these lists — confirm where they are filled.
data_ner_x = []
data_ner_y = []

In [380]:
# Annotate every tweet whose label is 1 and write the results to Enamex.txt,
# each followed by a dashed separator line.
# The tweets contain emoji and other non-ASCII characters, so the encoding is
# fixed to UTF-8 rather than left to the platform default, which may be
# unable to encode them.
with open("Enamex.txt", "w", encoding="utf-8") as f:
    for idx, row in data[data['label'] == 1].iterrows():
        new_text = get_ner_label(row['tweet_ner'], row['tweet'])
        f.write(new_text + '\n-----------------\n')

In [328]:
# NOTE(review): the visible cells only ever assign [] to data_ner_y, so on a
# fresh Restart & Run All this lookup would raise IndexError. The code that
# populated the list appears to be missing from the notebook — confirm.
data_ner_y[0]

['O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-LOC',
 'B-LOC',
 'B-LOC',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

In [5]:
# Show the raw text of the tweet at position 10 among the rows with label 1.
data[data['label'] == 1].iloc[10]['tweet']

'Bakor Pemandu ITS mengundang Pemandu Aktif ITS untuk duduk bareng ngobrolin LKMM TD pada:\n\n📆 Selasa - Rabu, 20-21 Feruari\n2018\n🕛 18.00 - 21.30 WIB\n📍 SCC Lt. 3\n👔 Standar Kuliah\n\n"Raise your standards to create change!" - An Iota of Truth\n\n#OborBakor\n#BAKORITS\n#ITSSurabaya https://t.co/v6oFZcWLFv'