## 查看数据分布

# Twitter dataset

In [3]:
# 查看 twitter data 数据格式和分布:frequency、numbers、shape etc.

In [4]:
import numpy as np
import pandas as pd
import datetime

### data_path

In [2]:
load_path = './data/raw dataset'  # 相对路径，..表示上上级路径
result_path = './result'

In [23]:
# load dataset
p_part1 = load_path + '/68841_tweets_multiclasses_filtered_0722_part1.npy'
print(p_part1)
p_part2 = load_path + '/68841_tweets_multiclasses_filtered_0722_part2.npy'
# Python 中的 pickle 用于在保存到磁盘文件或从磁盘文件读取之前，对对象进行序列化和反序列化
df_np_part1 = np.load(p_part1, allow_pickle=True)  # allow_pickle, Allow loading pickled object arrays stored in npy files
df_np_part2 = np.load(p_part2, allow_pickle=True)
df = np.concatenate((df_np_part1, df_np_part2),axis=0) # 按行拼接
print('loaded data.')
df = pd.DataFrame(data=df, columns=['event_id','tweet_id','text','user_id','created_at','user_loc','place_type',
                                      'place_full_name','place_country_code','hashtags','user_mentions','image_urls',
                                      'entities','words','filtered_words','sampled_words'])
print('Data converted to dataframe.')

./data/raw dataset/68841_tweets_multiclasses_filtered_0722_part1.npy
loaded data.
Data converted to dataframe.


In [24]:
# sort date by time
df = df.sort_values(by='created_at').reset_index(drop=True)
# append date
df['date'] = [d.date() for d in df['created_at']]
# -------------------------------------------------------------------
init_day = df.loc[0, 'date']

In [25]:
print('total message numbers:', df.shape)   
print('total number of event classes:', df.event_id.nunique()) 

total message numbers: (68841, 17)
total number of event classes: 503


In [7]:
df.head(10)

Unnamed: 0,event_id,tweet_id,text,user_id,created_at,user_loc,place_type,place_full_name,place_country_code,hashtags,user_mentions,image_urls,entities,words,filtered_words,sampled_words,date
0,394,255819992157786112,HipHop awards bout to be live!!,250870763,2012-10-10 00:00:13,,,,,[],[],[],[],"[award, live, bout, hiphop]","[award, live, bout, hiphop]",[],2012-10-10
1,394,255820118095978496,HIPHOP AWARDS TIME!,28026779,2012-10-10 00:00:43,SoundCloud/RaRaSupaStar,,,,[],[],[],[],"[HIPHOP, AWARDS, time]","[hiphop, awards, time]",[],2012-10-10
2,394,255820147489636353,Bet hiphop awards,566825483,2012-10-10 00:00:50,,,,,[],[],[],"[(Bet, GPE)]","[award, bet, hiphop]","[award, bet, hiphop]",[],2012-10-10
3,394,255820164023595008,BET HipHop awards is on!!!,197834311,2012-10-10 00:00:54,Saint Lucia ☀️🌴🇱🇨,,,,[],[],[],[],"[HipHop, BET, award]","[hiphop, bet, award]",[],2012-10-10
4,394,255820180884701184,Watchin Da BET Hiphop Awards,439490861,2012-10-10 00:00:58,"Michigan, USA",,,,[],[],[],[],"[Hiphop, Watchin, Awards, Da, BET]","[hiphop, watchin, awards, da, bet]",[],2012-10-10
5,394,255820336002646016,Tuned Into The Hiphop Awards,197259280,2012-10-10 00:01:35,,,,,[],[],[],[],"[Awards, Hiphop, tune]","[awards, hiphop, tune]",[],2012-10-10
6,394,255820419943247872,Bout 2 tune into the hiphop awards,103100887,2012-10-10 00:01:55,,,,,[],[],[],"[(Bout 2, PERSON)]","[award, Bout, hiphop, tune]","[award, bout, hiphop, tune]",[],2012-10-10
7,394,255820487307964416,Hiphop awards on finna tune in,40153735,2012-10-10 00:02:11,,,,,[],[],[],[],"[award, finna, Hiphop, tune]","[award, finna, hiphop, tune]",[],2012-10-10
8,487,255820516613578754,Ready Too See My Nigga 2 Chainz Perform!,437229768,2012-10-10 00:02:18,"Lafayette, AL",,,,[],[],[],[],"[Perform, Ready, see, Nigga, Chainz]","[perform, ready, see, nigga, chainz]",[],2012-10-10
9,394,255820562629263360,#workhard !!!\n#wiz \n#BET HipHop Awards !!,600320911,2012-10-10 00:02:29,Philadelphia,,,,"[workhard, wiz, BET]",[],[],"[(#, CARDINAL), (#wiz, MONEY)]","[award, hiphop]","[award, hiphop]",[],2012-10-10


### ldavec test

In [9]:
df = df[(df['date']>= init_day + datetime.timedelta(days=0)) & (df['date']<= init_day + datetime.timedelta(days=1))].reset_index() # (11971, 18)
print(df.shape)
print(df.event_id.nunique())
print(df.user_id.nunique())

(4762, 18)
57
4355


In [12]:
import os
project_path = os.getcwd()
train_vec = project_path + '/baselines/proasmdatasetVec.txt.model'

In [27]:
from gensim.models.doc2vec import TaggedDocument
from gensim.models.doc2vec import Doc2Vec
from gensim.utils import simple_preprocess
from smart_open import open

In [58]:
def process_doc(df):
    for line, event_id in zip(list(df.filtered_words), list(df.event_id)):
        print(line)
        yield TaggedDocument(words=line, tags=[event_id]) 

In [59]:
train_corpus = list(process_doc(df))

['award', 'live', 'bout', 'hiphop']
['hiphop', 'awards', 'time']
['award', 'bet', 'hiphop']
['hiphop', 'bet', 'award']
['hiphop', 'watchin', 'awards', 'da', 'bet']
['awards', 'hiphop', 'tune']
['award', 'bout', 'hiphop', 'tune']
['award', 'finna', 'hiphop', 'tune']
['perform', 'ready', 'see', 'nigga', 'chainz']
['award', 'hiphop']
['awards', 'bet', 'hiphop']
['guess', 'ill', 'awards', 'hiphop', 'tune']
['hiphop', 'nwbet', 'award']
['already', 'wit', 'watchin', 'bet', 'awards', 'live']
['weird', 'prize', 'physics', 'quantum', 'highlight', 'optic', 'world', 'nobel']
['award', 'hiphop', 'not', 'tonight', 'even', 'rap', 'much', 'come', 'know', 'really', 'do']
['bet', 'hiphop', 'awards']
['award', 'watchin', 'bet', 'ok']
['not', 'nas', 'year', 'lyricist', 'kendrick', 'lamar', 'holy', 'win', 'shit', 'do']
['award', 'be', 'bet', 'dyin', 'right']
['award', 'bet', 'deal']
['shrug', 'bet', 'awards', 'come', 'know', 'today']
['hiphop', 'bet', 'awards']
['award', 'watchin', 'da', 'bet']
['hiphop',

In [60]:
train_corpus

[TaggedDocument(words=['award', 'live', 'bout', 'hiphop'], tags=[394]),
 TaggedDocument(words=['hiphop', 'awards', 'time'], tags=[394]),
 TaggedDocument(words=['award', 'bet', 'hiphop'], tags=[394]),
 TaggedDocument(words=['hiphop', 'bet', 'award'], tags=[394]),
 TaggedDocument(words=['hiphop', 'watchin', 'awards', 'da', 'bet'], tags=[394]),
 TaggedDocument(words=['awards', 'hiphop', 'tune'], tags=[394]),
 TaggedDocument(words=['award', 'bout', 'hiphop', 'tune'], tags=[394]),
 TaggedDocument(words=['award', 'finna', 'hiphop', 'tune'], tags=[394]),
 TaggedDocument(words=['perform', 'ready', 'see', 'nigga', 'chainz'], tags=[487]),
 TaggedDocument(words=['award', 'hiphop'], tags=[394]),
 TaggedDocument(words=['awards', 'bet', 'hiphop'], tags=[394]),
 TaggedDocument(words=['guess', 'ill', 'awards', 'hiphop', 'tune'], tags=[394]),
 TaggedDocument(words=['hiphop', 'nwbet', 'award'], tags=[394]),
 TaggedDocument(words=['already', 'wit', 'watchin', 'bet', 'awards', 'live'], tags=[394]),
 Tagge

In [61]:
type(train_corpus)

list

In [62]:
def train(ftrain):
    # 实例化Doc2Vec模型
    model = Doc2Vec(vector_size=128, window=3, cbow_mean=1, min_count=1)
    # 更新现有的word2vec模型
    model.build_vocab(ftrain)  # 使用数据建立单词表
    model.train(ftrain, total_examples=model.corpus_count, epochs=30)  # 训练模型，更新模型参数
    model.save(train_vec)
    return model

In [63]:
model_dm = train(train_corpus)

## data_preprocess

In [8]:
# remove repeated messages and those associated with multiple event classes
# sort date by time
df = df.sort_values(by='created_at').reset_index(drop=True)
# append date
df['date'] = [d.date() for d in df['created_at']]

In [9]:
# remove repeated messages
df_duplicates = df.drop_duplicates(subset=['tweet_id'], keep='first')

In [10]:
print('duplicated total message numbers:', df_duplicates.shape)   
print('duplicated total number of event classes:', df_duplicates.event_id.nunique()) 

duplicated total message numbers: (68841, 17)
duplicated total number of event classes: 503


In [11]:
df_duplicates.tweet_id.head()

0    255819992157786112
1    255820118095978496
2    255820147489636353
3    255820164023595008
4    255820180884701184
Name: tweet_id, dtype: object

## df_extract

In [12]:
# df.shape

# df_count = df.event_id.value_counts().to_frame()
# df_count_2 = df_count[df_count.event_id >= 2].reset_index()
# df_count_2

# # extract and descend according to time
# df_extract = df[df.event_id.isin(df_count_2.index)]
# print(df_extract.shape)

# print(df_extract.event_id.nunique())

# # 全部显示
# pd.set_option('max_colwidth', None)
# df[df['event_id']==8].head()

# df_extract.loc['created_at'] = pd.to_datetime(df_extract['created_at'])
# df_time = df_extract.sort_values(by='created_at', ascending=False)
# df_time

# df_extract.info()

(68841, 17)

## df_offline

In [13]:
print(df.shape)
print(df.event_id.nunique())

(68841, 17)
503


In [14]:
df_extract = df

In [15]:
df.date.unique()

array([datetime.date(2012, 10, 10), datetime.date(2012, 10, 11),
       datetime.date(2012, 10, 12), datetime.date(2012, 10, 13),
       datetime.date(2012, 10, 14), datetime.date(2012, 10, 15),
       datetime.date(2012, 10, 16), datetime.date(2012, 10, 17),
       datetime.date(2012, 10, 18), datetime.date(2012, 10, 19),
       datetime.date(2012, 10, 20), datetime.date(2012, 10, 21),
       datetime.date(2012, 10, 22), datetime.date(2012, 10, 23),
       datetime.date(2012, 10, 24), datetime.date(2012, 10, 25),
       datetime.date(2012, 10, 26), datetime.date(2012, 10, 27),
       datetime.date(2012, 10, 28), datetime.date(2012, 10, 29),
       datetime.date(2012, 10, 30), datetime.date(2012, 10, 31),
       datetime.date(2012, 11, 1), datetime.date(2012, 11, 2),
       datetime.date(2012, 11, 3), datetime.date(2012, 11, 4),
       datetime.date(2012, 11, 5), datetime.date(2012, 11, 6),
       datetime.date(2012, 11, 7)], dtype=object)

In [16]:
# social message in the first week of offline experimental dataset
init_day = df.loc[0,'date']
# offline dataset
df_offline = df_extract[(df_extract['date']>= init_day) & (df_extract['date']<= init_day + datetime.timedelta(days=1))].reset_index()  # (11971, 18)

In [17]:
df_offline.date.unique()

array([datetime.date(2012, 10, 10), datetime.date(2012, 10, 11)],
      dtype=object)

In [18]:
print(df_offline.shape)   # (4762, 18)
print(df_offline.event_id.nunique())  # 57

(4762, 18)
57


In [19]:
init_day

datetime.date(2012, 10, 10)

## df_incremental

In [21]:
# social messages in 8th to 14th days
for i in range(1,8):
    day_label = str(i) + '-th day'
    print(day_label)
    df_messages = df_extract[df_extract['date'] == init_day + datetime.timedelta(days=i)]
    print('social messages in ' + day_label + ' date: ', df_messages.date.unique())
    print('social messages in ' + day_label + ' message numbers: ', df_messages.shape)   # (4762, 18)
    print('social messages in ' + day_label + ' evet_id numbers: ', df_messages.event_id.nunique())  # 57
    df_stream = df_extract[(df_extract['date']>= init_day + datetime.timedelta(days=i)) &
                        (df_extract['date']<= init_day + datetime.timedelta(days=1+i))].reset_index()
    print('social stream in ' + day_label + ' date: ', df_stream.date.unique())
    print('social stream in ' + day_label + ' message numbers: ', df_stream.shape)   # (4762, 18)
    print('social stream in ' + day_label + ' event_id numbers: ', df_stream.event_id.nunique())  # 57

1-th day
social messages in 1-th day date:  [datetime.date(2012, 10, 11)]
social messages in 1-th day message numbers:  (1812, 17)
social messages in 1-th day evet_id numbers:  41
social stream in 1-th day date:  [datetime.date(2012, 10, 11) datetime.date(2012, 10, 12)]
social stream in 1-th day message numbers:  (7258, 18)
social stream in 1-th day event_id numbers:  62
2-th day
social messages in 2-th day date:  [datetime.date(2012, 10, 12)]
social messages in 2-th day message numbers:  (5446, 17)
social messages in 2-th day evet_id numbers:  48
social stream in 2-th day date:  [datetime.date(2012, 10, 12) datetime.date(2012, 10, 13)]
social stream in 2-th day message numbers:  (7209, 18)
social stream in 2-th day event_id numbers:  59
3-th day
social messages in 3-th day date:  [datetime.date(2012, 10, 13)]
social messages in 3-th day message numbers:  (1763, 17)
social messages in 3-th day evet_id numbers:  33
social stream in 3-th day date:  [datetime.date(2012, 10, 13) datetime.d

# MAVEN dataset

In [2]:
load_path = './data/raw dataset'  # 相对路径，..表示上上级路径
result_path = './result'

In [5]:
# load dataset
p_part1 = load_path + '/all_df_words_ents_mids.npy'
# Python 中的 pickle 用于在保存到磁盘文件或从磁盘文件读取之前，对对象进行序列化和反序列化
df_part1 = np.load(p_part1, allow_pickle=True)  # allow_pickle, Allow loading pickled object arrays stored in npy files
print('loaded data.')
# df = pd.DataFrame(data=df_part1, columns=['document_ids', 'sentence_ids', 'sentences', 'event_type_ids',
#                                           'words', 'unique_words', 'entities', 'message_ids'])
df = pd.DataFrame(data=df_part1, columns=['user_id', 'sentence_ids', 'text', 'event_id',
                                          'words', 'filtered_words', 'entities', 'tweet_id'])
df['created_at'] = pd.to_datetime('2012-10-10')
df['user_mentions'] = df.user_id.apply(lambda x: [])
print('Data converted to dataframe.')

loaded data.
Data converted to dataframe.


In [6]:
print(df.shape)
print(df.tweet_id.nunique())
print(df.sentence_ids.nunique())
print(df.user_id.nunique())
print(df.event_id.nunique())

(10242, 10)
10242
63
3139
164


In [15]:
# #显示所有列
# pd.set_option('display.max_columns', None)
# #显示所有行
# pd.set_option('display.max_rows', None)
# #设置value的显示长度为100，默认为50
# # pd.set_option('max_colwidth',100)

In [18]:
str(df.loc[0,'words'])

"['accord', 'author', 'Neill', 'Macaulay', 'would', 'attack', 'odd', 'heavily', 'orwhen', 'clearly', 'advantage', 'surprise', 'cover', 'superior', 'firepower']"

In [19]:
str(df.loc[0,'filtered_words'])

"['odd', 'heavily', 'firepower', 'cover', 'orwhen', 'clearly', 'surprise', 'Neill', 'would', 'advantage', 'accord', 'superior', 'attack', 'Macaulay', 'author']"

In [16]:
df.head(10)

Unnamed: 0,user_id,sentence_ids,text,event_id,words,filtered_words,entities,tweet_id,created_at,user_mentions
0,387fe1dfe55067eb29e1fd4116d37af3,14,"According to author Neill Macaulay, ""he would ...",23,"[accord, author, Neill, Macaulay, would, attac...","[odd, heavily, firepower, cover, orwhen, clear...","[(Neill Macaulay, PERSON)]",387fe1dfe55067eb29e1fd4116d37af3_14,2012-10-10,[]
1,387fe1dfe55067eb29e1fd4116d37af3,10,Sandino had a tendency to greatly exaggerate n...,37,"[sandino, tendency, greatly, exaggerate, numbe...","[probably, inaccurate, exaggerate, sandino, re...","[(Sandino, ORG), (60, CARDINAL)]",387fe1dfe55067eb29e1fd4116d37af3_10,2012-10-10,[]
2,268c4763208c87ed7ebf55565c274d23,4,British forces in India were considerably stro...,42,"[british, force, India, considerably, strong, ...","[strong, Cornwallis, RearAdmiral, suppoed, Wil...","[(British, NORP), (India, GPE), (French, NORP)...",268c4763208c87ed7ebf55565c274d23_4,2012-10-10,[]
3,268c4763208c87ed7ebf55565c274d23,8,British forces constructed trenches and batter...,44,"[british, force, construct, trench, battery, o...","[construct, often, heavy, fire, force, british...","[(British, NORP), (the following weeks, DATE)]",268c4763208c87ed7ebf55565c274d23_8,2012-10-10,[]
4,268c4763208c87ed7ebf55565c274d23,0,"""For other sieges with this name, see Siege of...",45,"[siege, name, see, Siege, Pondicherry, disambi...","[Siege, early, stage, military, French, operat...","[(Siege, FAC), (Pondicherry, GPE), (The Siege ...",268c4763208c87ed7ebf55565c274d23_0,2012-10-10,[]
5,268c4763208c87ed7ebf55565c274d23,3,French India was governed from Pondicherry (mo...,41,"[French, India, govern, Pondicherry, modern, P...","[French, India, Coast, Pondicherry, modern, Co...","[(French, NORP), (India, GPE), (Pondicherry, G...",268c4763208c87ed7ebf55565c274d23_3,2012-10-10,[]
6,3bec0b60c0940c5e46ee2cfc9504df92,5,"However, normally they occur at very low speed...",15,"[however, normally, occur, low, speed, less, m...","[speed, less, kmh, normally, low, mph, however...","[(less than 5 mph, QUANTITY), (8 km/h, QUANTITY)]",3bec0b60c0940c5e46ee2cfc9504df92_5,2012-10-10,[]
7,39c2db9e18cd4a02b9aa8c1a3c58aab7,8,They were apprehended and taken to separate ho...,1,"[apprehend, take, separate, hospital]","[apprehend, hospital, separate, take]",[],39c2db9e18cd4a02b9aa8c1a3c58aab7_8,2012-10-10,[]
8,c1ae08941a115c7d45e205724e5aa3be,18,"It had begun shortly before 16 October, if bot...",31,"[begin, sholy, October, Chronica, obituary, Bu...","[Burgos, accurate, October, obituary, begin, C...","[(16 October, DATE), (Chronica, PRODUCT), (Bur...",c1ae08941a115c7d45e205724e5aa3be_18,2012-10-10,[]
9,c1ae08941a115c7d45e205724e5aa3be,36,This will was later confirmed at Sariñena on 4...,18,"[later, confirm, Sariñena, September]","[September, later, confirm, Sariñena]","[(Sariñena, ORG), (4 September 1134, DATE)]",c1ae08941a115c7d45e205724e5aa3be_36,2012-10-10,[]
