# README
## Options:
### Pretrained model:
- word2vec-ruscorpora-300	
- glove-twitter-200

### Tokenization: How to manage OOV?
- Link: '&'
- Tag: '#'
- Mention: '@' 

In [47]:
import copy
import pandas as pd
import numpy as np
import nltk
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import TweetTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import SnowballStemmer

import gensim
import gensim.models.word2vec as w2v
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument


from sklearn.preprocessing import StandardScaler

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score, precision_score, recall_score
from transformers import AdamW, get_linear_schedule_with_warmup

from fetchData import fetchdata, cv_events
import __MLP
import __Preprocessing
# from __MLP import getSamplers, convert_df_to_unsqueezed_tensor, train_sequential, clf_report
import random
import emoji


# from __Preprocess import *

In [2]:
# Notebook display limits: show up to 400 rows and up to 400 characters per cell.
# The column limit ('display.max_columns') is intentionally left at its default.
pd.options.display.max_rows = 400
pd.options.display.max_colwidth = 400

In [3]:
# Load the pretrained 'glove-twitter-200' embeddings through gensim's downloader API.
# The result is kept in the module-level name `model`, which later cells pass to getObjectW2V.
import gensim.downloader as api
model = api.load('glove-twitter-200')

# ! Functions

In [48]:
# Ordered (compiled pattern, replacement) rules, compiled once at definition time.
# Order matters: the specific forms ("won't", "can't", "ain't") must run before the
# generic "n't" rule, otherwise they would be split into "wo not" / "ca not".
_CONTRACTION_RULES = [
    (re.compile(regex, re.IGNORECASE), repl)
    for regex, repl in (
        (r"won't", "will not"),
        (r"can't", "cannot"),
        (r"i'm", "i am"),
        (r"ain't", "is not"),
        (r"(\w+)'ll", r"\g<1> will"),
        (r"(\w+)n't", r"\g<1> not"),
        (r"(\w+)'ve", r"\g<1> have"),
        (r"(\w+)'s", r"\g<1> is"),
        (r"(\w+)'re", r"\g<1> are"),
        (r"(\w+)'d", r"\g<1> would"),
        (r"&", "and"),
        (r"dammit", "damn it"),
        # Apostrophe-less spellings: anchored on word boundaries so that words which
        # merely contain them (e.g. "wonton") are left untouched.
        (r"\bdont\b", "do not"),
        (r"\bwont\b", "will not"),
    )
]


def replaceContraction(text):
    """Replace contractions in ``text`` with their expanded equivalents.

    Matching is case-insensitive, so "I'm" and "Won't" are expanded as well; the
    fixed replacements are emitted in lower case. ``'s`` is always expanded to
    " is" (possessives are not distinguished) and every ``&`` becomes "and".

    Args:
        text: The string to rewrite.

    Returns:
        A new string with every rule applied in order.
    """
    for pattern, repl in _CONTRACTION_RULES:
        text = pattern.sub(repl, text)
    return text

def getTokenization(raw_data):
    """Normalize and tokenize every tweet in ``raw_data.text``.

    Per tweet, in order:
      1. lower-case and expand contractions (``replaceContraction``);
      2. replace links with ``HTTPURL``, e-mail addresses with ``<email>`` and
         mentions with ``@USER`` (e-mails first, so the ``@`` inside an address
         is not consumed as a mention);
      3. replace percentages, clock times and dates with ``<percentage>``,
         ``<time>`` and ``<date>``;
      4. turn emoji into ``:shortcode:`` text and put a space in front of each
         shortcode so adjacent emoji do not fuse;
      5. split ``#tag`` into ``# tag``;
      6. drop every character that is not whitespace, a word character, ``@``,
         ``#`` or ``&`` (underscores are dropped too), keeping the ``<...>``
         placeholders inserted in steps 2-3 intact;
      7. collapse whitespace, lower-case, tokenize with ``TweetTokenizer`` and
         remove English stop words.

    Steps 2-3 deliberately run before step 6: the patterns depend on
    punctuation (``.``, ``%``, ``:``, ``/``) that step 6 removes.

    Args:
        raw_data: Object exposing an iterable ``text`` attribute of strings
            (e.g. a DataFrame with a ``text`` column).

    Returns:
        DataFrame with a single ``token`` column holding one token list per tweet.
    """
    tweet_tokenizer = TweetTokenizer()
    stop_words = set(stopwords.words('english'))

    # Compile the loop-invariant patterns once.
    url_re = re.compile(r"(www\S+)|(http\S+)")
    email_re = re.compile(r"[\w.+-]+@[\w-]+(?:\.[\w-]+)+")
    mention_re = re.compile(r"@\w+")
    percentage_re = re.compile(r"\d+(\%|\s\bpercent\b)")
    time_re = re.compile(r"([0-1][0-9]|[2][0-3])[:.h]([0-5][0-9])")
    date_res = (
        re.compile(r"([0-2][0-9]|(3)[0-1])(\/)(((0)[0-9])|((1)[0-2]))(\/)\d{4}"),  # dd/mm/yyyy
        re.compile(r"(((0)[0-9])|((1)[0-2]))(\/)([0-2][0-9]|(3)[0-1])(\/)\d{4}"),  # mm/dd/yyyy
        re.compile(r"\d{4}(\/)(((0)[0-9])|((1)[0-2]))(\/)([0-2][0-9]|(3)[0-1])"),  # yyyy/mm/dd
    )
    emoji_code_re = re.compile(r':[^:\s]*:')
    hashtag_re = re.compile(r"(#)(\S+)")
    # First alternative protects the placeholders; second is the punctuation to delete.
    strip_re = re.compile(r"(<(?:email|percentage|time|date)>)|(?:[^\s\w@#&]|_)+")
    whitespace_re = re.compile(r'\s+')

    tweet_tokens = []
    for sent in raw_data.text:
        # Lower-case up front so contraction rules also see "I'm" / "Won't".
        sent = replaceContraction(sent.lower())

        sent = url_re.sub("HTTPURL", sent)
        sent = email_re.sub("<email>", sent)
        sent = mention_re.sub("@USER", sent)
        sent = percentage_re.sub("<percentage>", sent)
        sent = time_re.sub("<time>", sent)
        for date_re in date_res:
            sent = date_re.sub("<date>", sent)

        sent = emoji.demojize(sent)
        sent = emoji_code_re.sub(r' \g<0>', sent)
        sent = hashtag_re.sub(r'\1 \2', sent)

        sent = strip_re.sub(lambda match: match.group(1) or '', sent)
        sent = whitespace_re.sub(' ', sent).strip()

        # Lower-case again: the URL / mention placeholders are inserted in upper case.
        tokens = tweet_tokenizer.tokenize(sent.lower())
        kept = [token for token in tokens if token not in stop_words]
        # Wrap in a one-element row so the token list lands in a single cell.
        tweet_tokens.append([kept])

    return pd.DataFrame(tweet_tokens, columns=['token'])

def getTokenization_less(raw_data):
    """Lightweight tweet tokenizer: placeholders, punctuation strip, stop words.

    Per tweet: links become ``*``, mentions become ``@``, ``#tag`` is split into
    ``# tag``, contractions are expanded, punctuation is removed, and the
    lower-cased text is tokenized with ``TweetTokenizer`` and filtered against
    the English stop-word list.

    Contractions are expanded *before* punctuation is stripped; once the
    apostrophes are gone, rules such as ``can't`` or ``'ve`` can no longer match.

    Args:
        raw_data: Object exposing an iterable ``text`` attribute of strings
            (e.g. a DataFrame with a ``text`` column).

    Returns:
        DataFrame with a single ``token`` column holding one token list per tweet.
    """
    tweet_tokenizer = TweetTokenizer()
    stop_words = set(stopwords.words('english'))

    # Compile the loop-invariant patterns once.
    url_re = re.compile(r"(www\S+)|(http\S+)")
    mention_re = re.compile(r"@\S+")
    hashtag_re = re.compile(r"(#)(\S+)")
    strip_re = re.compile(r'([^\s\w@#&]|_)+')

    tweet_tokens = []
    for sent in raw_data.text:
        sent = url_re.sub("*", sent)
        sent = mention_re.sub("@", sent)
        sent = hashtag_re.sub(r'\1 \2', sent)

        # Lower-case first so the contraction rules also see "I'm" / "Won't".
        sent = replaceContraction(sent.lower())

        # NOTE(review): '*' is not in the keep-set below, so the link placeholder
        # inserted above is deleted again here and links vanish from the output.
        # Confirm whether that is intended before changing it; downstream feature
        # files were produced with this behaviour.
        sent = strip_re.sub('', sent)

        tokens = tweet_tokenizer.tokenize(sent)
        kept = [token for token in tokens if token not in stop_words]
        # Wrap in a one-element row so the token list lands in a single cell.
        tweet_tokens.append([kept])

    return pd.DataFrame(tweet_tokens, columns=['token'])

def getObjectW2V(model):
    """Extract the keyed vectors, the vector matrix and a word -> row-index map.

    Accepts either a full model exposing ``.wv`` or a bare keyed-vectors object,
    and supports both vocabulary layouts: a ``key_to_index`` mapping when the
    object has one, otherwise a ``vocab`` mapping whose entries carry ``.index``.

    Args:
        model: Embedding model or keyed-vectors object.

    Returns:
        Tuple ``(w2v_object, w2v_vectors, w2v_indices)`` where ``w2v_vectors`` is
        ``w2v_object.vectors`` and ``w2v_indices`` maps each word to its row in it.
    """
    # A bare keyed-vectors object may have no `.wv`; fall back to the object itself.
    w2v_object = getattr(model, 'wv', model)
    w2v_vectors = w2v_object.vectors

    key_to_index = getattr(w2v_object, 'key_to_index', None)
    if key_to_index is not None:
        w2v_indices = dict(key_to_index)
    else:
        w2v_indices = {word: entry.index for word, entry in w2v_object.vocab.items()}
    return w2v_object, w2v_vectors, w2v_indices

def get_W2V_AVG(df_tokens):
    """Append the averaged word vector of each token list as ``vec_avg<i>`` columns.

    Each entry of ``df_tokens['token']`` is passed to ``vectorize`` and the
    resulting per-token vectors are averaged over the token axis.

    The input frame is not modified; a new frame is returned. Values are built
    in one pass instead of being written cell by cell through chained indexing
    (``df[col][i] = ...``), which pandas does not guarantee to write through.

    Args:
        df_tokens: DataFrame with a ``token`` column of token lists.

    Returns:
        ``df_tokens`` joined with one ``vec_avg<i>`` column per embedding dimension.
    """
    averaged = [vectorize(tokens).mean(axis=0) for tokens in df_tokens['token']]
    # Reuse the caller's index so the join lines up even if it is not 0..n-1.
    df_avg = pd.DataFrame(averaged, index=df_tokens.index).add_prefix('vec_avg')
    return df_tokens.join(df_avg)

def vectorize(line):
    """Look up the embedding of every token in ``line``.

    Reads the module-level ``w2v_indices`` (word -> row) and ``w2v_vectors``
    (embedding matrix). Tokens without an entry in ``w2v_indices`` are
    represented by an all-zero vector of the embedding width, which is taken
    from ``w2v_vectors`` rather than hard-coded.

    Args:
        line: Iterable of tokens.

    Returns:
        Array of shape ``(number of tokens, embedding width)``; for an empty
        ``line`` the shape is ``(0, embedding width)``.
    """
    dim = w2v_vectors.shape[1]
    oov_vector = np.zeros(dim)

    rows = []
    for word in line:
        idx = w2v_indices.get(word)
        # Row 0 is a valid index, so test against None rather than truthiness.
        rows.append(oov_vector if idx is None else w2v_vectors[idx])

    if not rows:
        return np.zeros((0, dim))
    return np.asarray(rows)

# ! CONVERSION: PHEME | PHEMEext | RHI

In [55]:
# Read the PHEME and PHEMEext tweet tables.
# The RHI table (./data/_RHI_text.csv) is currently not loaded here.
raw_PHEME, raw_PHEMEext = (
    pd.read_csv(csv_path)
    for csv_path in ("./data/_PHEME_text.csv", "./data/_PHEMEext_text.csv")
)

# Tokenize both tables with the lightweight tokenizer (getTokenization is not used here).
# Word-vector lookup and averaging are not performed in this cell.
datasets = list(map(getTokenization_less, [raw_PHEME, raw_PHEMEext]))

In [56]:
# Publish the lookup tables as module-level names: vectorize() reads
# w2v_indices and w2v_vectors as globals, so this line must run before
# get_W2V_AVG (which calls vectorize) is used below.
w2v_object, w2v_vectors, w2v_indices = getObjectW2V(model)
# Replace each tokenized dataset with the frame returned by get_W2V_AVG.
datasets = [get_W2V_AVG(dataset) for dataset in datasets]

  w2v_object = model.wv


In [59]:
# Write the feature frames to CSV without the 'token' column and without the index.
# (An RHI export to './data/_RHI_text_AVGw2v.csv' used to live here and is disabled.)
datasets[0].drop(columns=['token']).to_csv(
    './data/_PHEME_text_AVGw2v_final.csv',
    index=False,
)
datasets[1].drop(columns=['token']).to_csv(
    './data/_PHEMEext_text_AVGw2v_final.csv',
    index=False,
)

In [62]:
# Display the first dataset's frame without its 'token' column.
datasets[0].drop(['token'],axis=1)

Unnamed: 0,vec_avg0,vec_avg1,vec_avg2,vec_avg3,vec_avg4,vec_avg5,vec_avg6,vec_avg7,vec_avg8,vec_avg9,...,vec_avg190,vec_avg191,vec_avg192,vec_avg193,vec_avg194,vec_avg195,vec_avg196,vec_avg197,vec_avg198,vec_avg199
0,-0.214452,0.022334,0.199403,0.091531,-0.106364,-0.135615,0.224626,-0.100319,0.156632,-0.080552,...,0.113106,-0.158723,0.046337,-0.102808,-0.085297,-0.109255,0.093852,-0.004668,0.290299,-0.042824
1,-0.195160,-0.031767,-0.221398,0.100288,-0.015177,-0.042167,0.013799,0.088592,0.268327,0.028013,...,-0.074119,-0.061033,-0.049259,0.049295,-0.116513,-0.075842,-0.245682,-0.123421,0.156163,-0.033397
2,-0.169584,-0.189962,-0.024939,0.014669,-0.005484,-0.139320,-0.007687,0.075244,0.258728,-0.223240,...,0.041829,-0.194850,-0.052420,0.098689,-0.093977,-0.054104,0.008628,-0.100813,0.131512,-0.046161
3,-0.066611,-0.020600,-0.276818,0.173542,0.086674,0.009584,0.154459,-0.003033,0.392480,-0.100663,...,0.236469,-0.103207,0.005035,-0.016566,-0.165597,-0.128446,-0.253505,-0.236819,0.127593,-0.069560
4,-0.155211,-0.274652,-0.147827,0.243690,0.092317,-0.179275,-0.047098,-0.082695,0.426767,-0.086420,...,0.243678,-0.092047,-0.154122,0.105979,-0.174003,-0.064574,0.005597,-0.458174,0.126540,-0.152261
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5797,0.106938,0.010202,0.232050,-0.164897,-0.209751,-0.040170,0.197595,-0.315733,0.216575,0.135785,...,0.043007,-0.120709,-0.028817,-0.140666,0.038503,-0.039655,0.187898,-0.196423,0.054202,0.123615
5798,0.046390,-0.021311,0.055554,-0.253835,0.038298,-0.210279,0.158166,-0.008340,-0.064945,0.061531,...,-0.086546,0.007143,-0.023466,-0.198655,0.108289,0.072826,-0.123237,-0.229534,0.018978,0.125586
5799,0.136621,0.153237,0.028462,-0.117230,-0.080547,-0.097569,0.462077,-0.077477,-0.013268,0.070316,...,0.142870,-0.085517,0.168641,-0.129973,0.029105,0.072826,0.168497,0.070344,0.068108,-0.116517
5800,-0.119541,-0.062041,-0.067776,0.133035,-0.006523,-0.197435,-0.374273,-0.226611,0.016173,-0.203868,...,0.495836,-0.286977,0.104145,-0.171063,0.058746,-0.203296,0.106747,0.067551,0.144041,0.038614


# Exploration (알아보기)

In [16]:
# Build a set of the English stop words from NLTK's stopwords corpus
# (a set gives constant-time membership tests when filtering tokens).
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
# stop_words

In [158]:
# Load the PHEME and RHI tweet tables from CSV.
raw_data = pd.read_csv("./data/_PHEME_text.csv")
raw_RHI = pd.read_csv("./data/_RHI_text.csv")

# Module-level NLP helpers and accumulators for the tokenization loop further
# down in this cell. That loop uses stemmer, tweet_tokenizer, tweet_tokens and
# stop_words; lmt appears only in a commented-out line there, and freqdist is
# not referenced by it.
lmt = WordNetLemmatizer()
stemmer = SnowballStemmer("english")
freqdist = nltk.FreqDist()
tweet_tokenizer = TweetTokenizer()
tweet_tokens = []
stop_words = set(stopwords.words('english'))

# Ordered (regex, replacement) rules. Order matters: the specific forms
# ("won't", "can't", "ain't") must run before the generic "n't" rule, otherwise
# they would be split into "wo not" / "ca not".
contraction_patterns = [
    (r"won't", "will not"),
    (r"can't", "cannot"),
    (r"i'm", "i am"),
    (r"ain't", "is not"),
    (r"(\w+)'ll", r"\g<1> will"),
    (r"(\w+)n't", r"\g<1> not"),
    (r"(\w+)'ve", r"\g<1> have"),
    (r"(\w+)'s", r"\g<1> is"),
    (r"(\w+)'re", r"\g<1> are"),
    (r"(\w+)'d", r"\g<1> would"),
    (r"&", "and"),
    (r"dammit", "damn it"),
    # Apostrophe-less spellings: anchored on word boundaries so that words which
    # merely contain them (e.g. "wonton") are left untouched.
    (r"\bdont\b", "do not"),
    (r"\bwont\b", "will not"),
]


def replaceContraction(text):
    """Replace contractions in ``text`` with their expanded equivalents.

    Applies every rule of the module-level ``contraction_patterns`` in order.
    Matching is case-insensitive, so "I'm" and "Won't" are expanded as well; the
    fixed replacements are emitted in lower case. ``'s`` is always expanded to
    " is" (possessives are not distinguished) and every ``&`` becomes "and".

    Args:
        text: The string to rewrite.

    Returns:
        A new string with every rule applied.
    """
    for regex, repl in contraction_patterns:
        # re caches compiled patterns internally, so passing the strings is cheap.
        text = re.sub(regex, repl, text, flags=re.IGNORECASE)
    return text

# Tokenize every tweet of raw_data into tweet_tokens, then build df_tokens.
for sent in raw_data.text:

    # Expand contractions first, on lower-cased text:
    #  - the rules need the apostrophes that the punctuation strip below removes;
    #  - replaceContraction rewrites '&' to 'and', so it has to run before '&'
    #    is introduced as the link placeholder.
    sent = replaceContraction(sent.lower())

    sent = re.sub(r"http\S+", "&", sent)          # link -> '&'
    sent = re.sub(r"(#)(\S+)", r'\1 \2', sent)    # '#tag' -> '# tag'

    # Keep only whitespace, word characters, '@', '#' and '&' (underscores are dropped).
    sent = re.sub(r'([^\s\w@#&]|_)+', '', sent)
    sent = re.sub(r'@[^\s]+', 'atUser', sent)     # mention -> 'atUser'

    sent = tweet_tokenizer.tokenize(sent.lower())

    # Filter stop words before stemming: stemmed forms (e.g. 'very' -> 'veri')
    # would no longer match the stop-word list.
    temp = [stemmer.stem(token) for token in sent if token not in stop_words]
    # Wrap in a one-element row so the token list lands in a single cell.
    tweet_tokens.append([temp])
df_tokens = pd.DataFrame(tweet_tokens, columns=['token'])

## Import data

In [219]:
# Load the PHEME tweet table into raw_data.
raw_data = pd.read_csv("./data/_PHEME_text.csv")

In [9]:
# Print the frame's (rows, columns) shape, then display five randomly sampled rows.
print(raw_data.shape)
raw_data.sample(5)

(5802, 2)


Unnamed: 0,text,Event
1080,"#ICYMI, Christopher Hitchens on the case for m...",charliehebdo
4120,DEVELOPING NEWS: Soldier shot at War Memorial....,ottawashooting
5004,RT @tomsteinfort: Terrifying photo of hostages...,sydneysiege
2305,#Ferguson chief said the officer was unaware o...,ferguson
4898,BREAKING: 2 people have run out of Sydney buil...,sydneysiege


## Preprocessing

## Tokenization

In [243]:
tweet_tokenizer = TweetTokenizer()
tweet_tokens = []

# Replace links, mentions and hashtags by single-character placeholders, strip
# the remaining punctuation, and tokenize the lower-cased text.
for sent in raw_data.text:
    sent = re.sub(r"http\S+", "&", sent)          # link    -> '&'
    sent = re.sub(r"@\S+", "@", sent)             # mention -> '@'
    sent = re.sub(r"#\S+", "#", sent)             # hashtag -> '#'
    # Keep only whitespace, word characters, '@', '#' and '&' (underscores are dropped).
    sent = re.sub(r'([^\s\w@#&]|_)+', '', sent)
    # Wrap in a one-element row so the token list lands in a single cell.
    sent = [tweet_tokenizer.tokenize(sent.lower())]
    tweet_tokens.append(sent)
df_tokens = pd.DataFrame(tweet_tokens, columns=['token'])


In [237]:
# Display the first rows of df_tokens.
df_tokens.head()

Unnamed: 0,token
0,"[breaking, armed, man, takes, hostage, in, kosher, grocery, east, of, paris, &]"
1,"[#, killers, dead, confirmed, by, gendarmerie]"
2,"[top, french, cartoonists, charb, cabu, wolinski, tignous, confirmed, among, dead, in, #, #, attack, editor, is, critically, wounded]"
3,"[police, have, surrounded, the, area, where, the, #, attack, suspects, are, believed, to, be, &, &]"
4,"[photo, armed, gunmen, face, police, officers, near, #, hq, in, paris, &, &]"


## Word2Vec

### Downloading Pretrained model for Gensim

### Fetching pretrained Model and Convert the Raw Text

In [230]:
# Load the pretrained 'glove-twitter-200' embeddings through gensim's downloader API.
import gensim.downloader as api
from gensim.models.word2vec import Word2Vec
# Other downloader resources, currently disabled:
# corpus = api.load('text8')
# wv = api.load('word2vec-google-news-300')
# fasttext-wiki-news-subwords-300'
#  'glove-twitter-200',
model = api.load('glove-twitter-200')

In [20]:
# import inspect
# print(inspect.getsource(wv.__class__))

<gensim.models.keyedvectors.Word2VecKeyedVectors at 0x7fa0739797f0>

In [42]:
# Keyed-vectors object of the loaded model.
w2v_object = model.wv
# Embedding matrix: one row per vocabulary word.
w2v_vectors = w2v_object.vectors
# Word -> row index into w2v_vectors.
w2v_indices = {
    word: entry.index
    for word, entry in w2v_object.vocab.items()
}

In [241]:
def vectorize(line):
    """Look up the embedding of every token in ``line``.

    Reads the module-level ``w2v_indices`` (word -> row) and ``w2v_vectors``
    (embedding matrix). Tokens without an entry in ``w2v_indices`` are
    represented by an all-zero vector of the embedding width, which is taken
    from ``w2v_vectors`` rather than hard-coded.

    Args:
        line: Iterable of tokens.

    Returns:
        Array of shape ``(number of tokens, embedding width)``; for an empty
        ``line`` the shape is ``(0, embedding width)``.
    """
    dim = w2v_vectors.shape[1]
    oov_vector = np.zeros(dim)

    rows = []
    for word in line:
        idx = w2v_indices.get(word)
        # Row 0 is a valid index, so test against None rather than truthiness.
        rows.append(oov_vector if idx is None else w2v_vectors[idx])

    if not rows:
        return np.zeros((0, dim))
    return np.asarray(rows)

In [233]:
# print("Tweet 1: ", raw_data['text'][1])
# print("Indice of '{}': {}".format(df_tokens['token'][1][0], w2v_indices[df_tokens['token'][1][0]]))
# # print("Indice of '{}': {}".format(raw_data['text_token'][1][0], w2v_vectors[w2v_indices[raw_data['text_token'][1][0]]]))
# # print("Indice of '{}': {}".format(raw_data['text_token'][1][1], w2v_indices[raw_data['text_token'][1][1]]))
# # print("Indice of '{}': {}".format(raw_data['text_token'][1][1], w2v_vectors[w2v_indices[raw_data['text_token'][1][1]]]))
# # print("\nVector of the first headline:\n", vectorize(raw_data['text_token'][1]))

In [244]:
import copy

# Average the word vectors of every token list and append them to df_tokens as
# vec_avg<i> columns. The averages are built in one pass instead of being
# written cell by cell through chained indexing (df[col][i] = ...), which
# pandas does not guarantee to write through to the frame.
df_temp = pd.DataFrame(
    [vectorize(sent).mean(axis=0) for sent in df_tokens['token']],
    index=df_tokens.index,
).add_prefix('vec_avg')

df_tokens = df_tokens.join(df_temp)
# df_test.drop('text_token_vec',axis=1, inplace=True)

In [1]:
# Display the first rows of df_tokens.
# NOTE(review): the recorded output for this cell is a NameError, i.e. it was
# executed in a session where df_tokens had not been defined yet.
df_tokens.head()

NameError: name 'df_tokens' is not defined