## 查看数据分布

In [3]:
# 查看 twitter data 数据格式和分布:frequency、numbers、shape etc.

In [4]:
import numpy as np
import pandas as pd
import datetime

### data_path

In [5]:
load_path = './data/raw dataset'  # 相对路径，..表示上上级路径
result_path = './result'

In [6]:
# load dataset
p_part1 = load_path + '/68841_tweets_multiclasses_filtered_0722_part1.npy'
print(p_part1)
p_part2 = load_path + '/68841_tweets_multiclasses_filtered_0722_part2.npy'
# Python 中的 pickle 用于在保存到磁盘文件或从磁盘文件读取之前，对对象进行序列化和反序列化
df_np_part1 = np.load(p_part1, allow_pickle=True)  # allow_pickle, Allow loading pickled object arrays stored in npy files
df_np_part2 = np.load(p_part2, allow_pickle=True)
df = np.concatenate((df_np_part1, df_np_part2),axis=0) # 按行拼接
print('loaded data.')
df = pd.DataFrame(data=df, columns=['event_id','tweet_id','text','user_id','created_at','user_loc','place_type',
                                      'place_full_name','place_country_code','hashtags','user_mentions','image_urls',
                                      'entities','words','filtered_words','sampled_words'])
print('Data converted to dataframe.')

./data/raw dataset/68841_tweets_multiclasses_filtered_0722_part1.npy
loaded data.
Data converted to dataframe.


In [7]:
# sort date by time
df = df.sort_values(by='created_at').reset_index(drop=True)
# append date
df['date'] = [d.date() for d in df['created_at']]
# -------------------------------------------------------------------
init_day = df.loc[0, 'date']

In [9]:
print('total message numbers:', df.shape)   
print('total number of event classes:', df.event_id.nunique()) 

total message numbers: (68841, 17)
total number of event classes: 503


In [10]:
df.head(10)

Unnamed: 0,event_id,tweet_id,text,user_id,created_at,user_loc,place_type,place_full_name,place_country_code,hashtags,user_mentions,image_urls,entities,words,filtered_words,sampled_words,date
0,394,255819992157786112,HipHop awards bout to be live!!,250870763,2012-10-10 00:00:13,,,,,[],[],[],[],"[award, live, bout, hiphop]","[award, live, bout, hiphop]",[],2012-10-10
1,394,255820118095978496,HIPHOP AWARDS TIME!,28026779,2012-10-10 00:00:43,SoundCloud/RaRaSupaStar,,,,[],[],[],[],"[HIPHOP, AWARDS, time]","[hiphop, awards, time]",[],2012-10-10
2,394,255820147489636353,Bet hiphop awards,566825483,2012-10-10 00:00:50,,,,,[],[],[],"[(Bet, GPE)]","[award, bet, hiphop]","[award, bet, hiphop]",[],2012-10-10
3,394,255820164023595008,BET HipHop awards is on!!!,197834311,2012-10-10 00:00:54,Saint Lucia ☀️🌴🇱🇨,,,,[],[],[],[],"[HipHop, BET, award]","[hiphop, bet, award]",[],2012-10-10
4,394,255820180884701184,Watchin Da BET Hiphop Awards,439490861,2012-10-10 00:00:58,"Michigan, USA",,,,[],[],[],[],"[Hiphop, Watchin, Awards, Da, BET]","[hiphop, watchin, awards, da, bet]",[],2012-10-10
5,394,255820336002646016,Tuned Into The Hiphop Awards,197259280,2012-10-10 00:01:35,,,,,[],[],[],[],"[Awards, Hiphop, tune]","[awards, hiphop, tune]",[],2012-10-10
6,394,255820419943247872,Bout 2 tune into the hiphop awards,103100887,2012-10-10 00:01:55,,,,,[],[],[],"[(Bout 2, PERSON)]","[award, Bout, hiphop, tune]","[award, bout, hiphop, tune]",[],2012-10-10
7,394,255820487307964416,Hiphop awards on finna tune in,40153735,2012-10-10 00:02:11,,,,,[],[],[],[],"[award, finna, Hiphop, tune]","[award, finna, hiphop, tune]",[],2012-10-10
8,487,255820516613578754,Ready Too See My Nigga 2 Chainz Perform!,437229768,2012-10-10 00:02:18,"Lafayette, AL",,,,[],[],[],[],"[Perform, Ready, see, Nigga, Chainz]","[perform, ready, see, nigga, chainz]",[],2012-10-10
9,394,255820562629263360,#workhard !!!\n#wiz \n#BET HipHop Awards !!,600320911,2012-10-10 00:02:29,Philadelphia,,,,"[workhard, wiz, BET]",[],[],"[(#, CARDINAL), (#wiz, MONEY)]","[award, hiphop]","[award, hiphop]",[],2012-10-10


## data_preprocess

In [11]:
# remove repeated messages and those associated with multiple event classes
# sort date by time
df = df.sort_values(by='created_at').reset_index(drop=True)
# append date
df['date'] = [d.date() for d in df['created_at']]

In [12]:
# remove repeated messages
df_duplicates = df.drop_duplicates(subset=['tweet_id'], keep='first')

In [13]:
print('duplicated total message numbers:', df_duplicates.shape)   
print('duplicated total number of event classes:', df_duplicates.event_id.nunique()) 

duplicated total message numbers: (68841, 17)
duplicated total number of event classes: 503


In [18]:
df_duplicates.tweet_id.head()

0    255819992157786112
1    255820118095978496
2    255820147489636353
3    255820164023595008
4    255820180884701184
Name: tweet_id, dtype: object

## df_extract

In [24]:
df.shape

(68841, 17)

In [25]:
df_count = df.event_id.value_counts().to_frame()
df_count_2 = df_count[df_count.event_id >= 2].reset_index()
df_count_2

Unnamed: 0,index,event_id
0,8,7133
1,1,3358
2,11,2859
3,157,1550
4,22,1399
...,...,...
491,346,3
492,311,3
493,206,2
494,392,2


In [26]:
# extract and descend according to time
df_extract = df[df.event_id.isin(df_count_2.index)]
print(df_extract.shape)
print(df_extract.event_id.nunique())

(68043, 17)
493


In [32]:
# 全部显示
pd.set_option('max_colwidth', None)
df[df['event_id']==8].head()

Unnamed: 0,event_id,tweet_id,text,user_id,created_at,user_loc,place_type,place_full_name,place_country_code,hashtags,user_mentions,image_urls,entities,words,filtered_words,sampled_words,date
19739,8,258327573812109317,Debate comes on to tonight !!!!!! #Obama,110040608,2012-10-16 22:04:27,Southern Cali✈ATL,,,,[Obama],[],[],"[(tonight, TIME), (#, CARDINAL)]","[come, tonight, debate]","[come, tonight, debate]",[],2012-10-16
19741,8,258327737322844161,"Debate Tonight TurnUp Tuesday @9pm. I just got of the phone Obama ! He Said ""I got this my Brother"" #2terms",30707097,2012-10-16 22:05:06,"South Carolina, Conway",,,,[2terms],[16286578],[],"[(Tuesday, DATE), (@9pm, PERSON), (Obama, GPE)]","[Obama, debate, brother, get, phone, TurnUp, say, Tuesday, Tonight]","[obama, debate, brother, get, phone, turnup, say, tuesday, tonight]",[],2012-10-16
19754,8,258329230482501633,"Photoset: Hempstead, NY Obama-Romney 2nd debate http://t.co/XDb95u6e",27631423,2012-10-16 22:11:02,Italia,,,,[],[],[],"[(Hempstead, ORG), (NY Obama-Romney 2nd, PERSON)]","[Hempstead, 2nd, debate, ObamaRomney, Photoset, NY]","[hempstead, debate, obamaromney, photoset, ny]",[obamaromney],2012-10-16
19755,8,258329381552934913,Do you think #Obama will be able to rebound tonight? Or will #Romney keep his momentum going?,575961236,2012-10-16 22:11:38,"Anaheim Hills, CA",,,,"[Obama, Romney]",[],[],"[(#, CARDINAL), (tonight, TIME), (#, CARDINAL), (Romney, ORG)]","[keep, tonight, think, momentum, able, rebound, go]","[keep, tonight, think, momentum, able, rebound, go]",[],2012-10-16
19757,8,258329515422531584,"If it gets to tough for Obama, he can always hide behind Crowley.",445019219,2012-10-16 22:12:10,,,,,[],[],[],"[(Obama, GPE), (Crowley, ORG)]","[always, tough, Obama, get, hide, Crowley, behind]","[always, tough, obama, get, hide, crowley, behind]",[],2012-10-16


In [33]:
df_extract.loc['created_at'] = pd.to_datetime(df_extract['created_at'])
df_time = df_extract.sort_values(by='created_at', ascending=False)
df_time

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_extract.loc['created_at'] = pd.to_datetime(df_extract['created_at'])


Unnamed: 0,event_id,tweet_id,text,user_id,created_at,user_loc,place_type,place_full_name,place_country_code,hashtags,user_mentions,image_urls,entities,words,filtered_words,sampled_words,date
68840,430,265966797839273984,If romney wins idk i could leave the country cuz i cant afford to take my whole fam,432194703,2012-11-07 00:00:00,,,,,[],[],[],[],"[not, cuz, afford, idk, country, can, whole, win, leave, take, fam, romney, could]","[not, cuz, afford, idk, country, can, whole, win, leave, take, fam, romney, could]",[],2012-11-07
68839,197,265966617589084160,"Bombings rock Damascus, brother of parliament speaker killed: ""Anything, anything, to get that man out of the co... http://t.co/DgLLkr77",311802961,2012-11-06 23:59:17,,,,,[],[],[],"[(Damascus, GPE)]","[Anything, Damascus, rock, kill, brother, co, parliament, get, man, anything, speaker, bombing]","[anything, damascus, rock, kill, brother, co, parliament, get, man, anything, speaker, bombing]",[],2012-11-06
68838,52,265966529496092675,About to watch the election presentation,283768568,2012-11-06 23:58:56,,,,,[],[],[],[],"[election, watch, presentation]","[election, watch, presentation]",[],2012-11-06
68837,52,265966512672759808,I'm watching United States Presidential Election 2012 (3902 others checked-in) http://t.co/o6yLeDqM #GetGlue #election2012,115979133,2012-11-06 23:58:52,,,,,"[GetGlue, election2012]",[],[],"[(United States Presidential Election 2012, ORG), (3902, CARDINAL), (#GetGlue #, MONEY)]","[be, United, -PRON-, other, Election, checkedin, States, watch, Presidential]","[be, united, other, election, checkedin, states, watch, presidential]",[checkedin],2012-11-06
68836,67,265966361803640832,"“@5ftOfReal: 3 states might legalize marijuana, the world is changing lol” nc?",257380081,2012-11-06 23:58:16,2 5 2,,,,[],[],[],"[(3, CARDINAL)]","[state, legalize, may, "", marijuana, nc, change, world, “, lol]","[state, legalize, may, marijuana, nc, change, world, lol]",[],2012-11-06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3,394,255820164023595008,BET HipHop awards is on!!!,197834311,2012-10-10 00:00:54,Saint Lucia ☀️🌴🇱🇨,,,,[],[],[],[],"[HipHop, BET, award]","[hiphop, bet, award]",[],2012-10-10
2,394,255820147489636353,Bet hiphop awards,566825483,2012-10-10 00:00:50,,,,,[],[],[],"[(Bet, GPE)]","[award, bet, hiphop]","[award, bet, hiphop]",[],2012-10-10
1,394,255820118095978496,HIPHOP AWARDS TIME!,28026779,2012-10-10 00:00:43,SoundCloud/RaRaSupaStar,,,,[],[],[],[],"[HIPHOP, AWARDS, time]","[hiphop, awards, time]",[],2012-10-10
0,394,255819992157786112,HipHop awards bout to be live!!,250870763,2012-10-10 00:00:13,,,,,[],[],[],[],"[award, live, bout, hiphop]","[award, live, bout, hiphop]",[],2012-10-10


In [34]:
df_extract.info()

<class 'pandas.core.frame.DataFrame'>
Index: 68044 entries, 0 to created_at
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   event_id            68043 non-null  object        
 1   tweet_id            68043 non-null  object        
 2   text                68043 non-null  object        
 3   user_id             68043 non-null  object        
 4   created_at          68043 non-null  datetime64[ns]
 5   user_loc            68043 non-null  object        
 6   place_type          68043 non-null  object        
 7   place_full_name     68043 non-null  object        
 8   place_country_code  68043 non-null  object        
 9   hashtags            68043 non-null  object        
 10  user_mentions       68043 non-null  object        
 11  image_urls          68043 non-null  object        
 12  entities            68043 non-null  object        
 13  words               68043 non-null  object    

## df_offline

In [29]:
df.date.unique()

array([datetime.date(2012, 10, 10), datetime.date(2012, 10, 11),
       datetime.date(2012, 10, 12), datetime.date(2012, 10, 13),
       datetime.date(2012, 10, 14), datetime.date(2012, 10, 15),
       datetime.date(2012, 10, 16), datetime.date(2012, 10, 17),
       datetime.date(2012, 10, 18), datetime.date(2012, 10, 19),
       datetime.date(2012, 10, 20), datetime.date(2012, 10, 21),
       datetime.date(2012, 10, 22), datetime.date(2012, 10, 23),
       datetime.date(2012, 10, 24), datetime.date(2012, 10, 25),
       datetime.date(2012, 10, 26), datetime.date(2012, 10, 27),
       datetime.date(2012, 10, 28), datetime.date(2012, 10, 29),
       datetime.date(2012, 10, 30), datetime.date(2012, 10, 31),
       datetime.date(2012, 11, 1), datetime.date(2012, 11, 2),
       datetime.date(2012, 11, 3), datetime.date(2012, 11, 4),
       datetime.date(2012, 11, 5), datetime.date(2012, 11, 6),
       datetime.date(2012, 11, 7)], dtype=object)

In [37]:
# social message in the first week of offline experimental dataset
init_day = df.loc[0,'date']
# offline dataset
df_offline = df_extract[(df_extract['date']>= init_day) & (df_extract['date']<= init_day + datetime.timedelta(days=6))].reset_index()  # (11971, 18)

In [42]:
df_offline.date.unique()

array([datetime.date(2012, 10, 10), datetime.date(2012, 10, 11),
       datetime.date(2012, 10, 12), datetime.date(2012, 10, 13),
       datetime.date(2012, 10, 14), datetime.date(2012, 10, 15),
       datetime.date(2012, 10, 16)], dtype=object)

In [39]:
print(df_offline.shape)   # (4762, 18)
print(df_offline.event_id.nunique())  # 57

(19638, 18)
149


In [31]:
init_day

datetime.date(2012, 10, 10)

## df_incremental

In [74]:
# social messages in 8th to 14th days
for i in range(1,8):
    day_label = str(7+i) + '-th day'
    print(day_label)
    df_messages = df_extract[df_extract['date'] == init_day + datetime.timedelta(days=i)]
    print('social messages in ' + day_label + ' date: ', df_messages.date.unique())
    print('social messages in ' + day_label + ' message numbers: ', df_messages.shape)   # (4762, 18)
    print('social messages in ' + day_label + ' evet_id numbers: ', df_messages.event_id.nunique())  # 57
    df_stream = df_extract[(df_extract['date']>= init_day + datetime.timedelta(days=i)) &
                        (df_extract['date']<= init_day + datetime.timedelta(days=6+i))].reset_index()
    print('social stream in ' + day_label + ' date: ', df_stream.date.unique())
    print('social stream in ' + day_label + ' message numbers: ', df_stream.shape)   # (4762, 18)
    print('social stream in ' + day_label + ' event_id numbers: ', df_stream.event_id.nunique())  # 57

8-th day
social messages in 8-th day date:  [datetime.date(2012, 10, 11)]
social messages in 8-th day message numbers:  (1786, 17)
social messages in 8-th day evet_id numbers:  40
social stream in 8-th day date:  [datetime.date(2012, 10, 11) datetime.date(2012, 10, 12)
 datetime.date(2012, 10, 13) datetime.date(2012, 10, 14)
 datetime.date(2012, 10, 15) datetime.date(2012, 10, 16)
 datetime.date(2012, 10, 17)]
social stream in 8-th day message numbers:  (25952, 18)
social stream in 8-th day event_id numbers:  146
9-th day
social messages in 9-th day date:  [datetime.date(2012, 10, 12)]
social messages in 9-th day message numbers:  (5428, 17)
social messages in 9-th day evet_id numbers:  47
social stream in 9-th day date:  [datetime.date(2012, 10, 12) datetime.date(2012, 10, 13)
 datetime.date(2012, 10, 14) datetime.date(2012, 10, 15)
 datetime.date(2012, 10, 16) datetime.date(2012, 10, 17)
 datetime.date(2012, 10, 18)]
social stream in 9-th day message numbers:  (25657, 18)
social stre