# Prepare Data.

In [68]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import re
import pytz
# import deepcut
from datetime import datetime, timezone
from pythainlp import word_tokenize
from pythainlp.corpus.common import thai_stopwords

# Read data.

In [69]:
def read_data(tweets_data_path):
    """ Function read data. """
    tweets_data = []
    tweets_file = open(tweets_data_path, "r")
    for line in tweets_file:
        try:
            tweet = json.loads(line)
            tweets_data.append(tweet)
        except:
            continue
    tweets_file.close()
    return tweets_data

In [70]:
# Combine data into a list tweets_data.
tweets_data = read_data('data/07-09-62-1.txt')

In [71]:
tweets_data[:10]

[{'created_at': 'Sat Sep 07 01:21:50 +0000 2019',
  'id': 1170145116549287937,
  'id_str': '1170145116549287937',
  'text': 'RT @______1__2_3: เธอออ ฝากแฟนชานท์น้าาา เค้าจะได้ไม่พูดคนเดียว😅เพลงของx1ทั้งหลายย #KCON2019THAILAND https://t.co/HYVf6IV3eE',
  'source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>',
  'truncated': False,
  'in_reply_to_status_id': None,
  'in_reply_to_status_id_str': None,
  'in_reply_to_user_id': None,
  'in_reply_to_user_id_str': None,
  'in_reply_to_screen_name': None,
  'user': {'id': 1007826148590436352,
   'id_str': '1007826148590436352',
   'name': 'รักคุณเสียยิ่งกว่าใครมอบกล่องดวงใจเอาไว้ที่คุณ',
   'screen_name': 'kwanjirakoko',
   'location': 'กรุงเทพมหานคร, ประเทศไทย',
   'url': None,
   'description': None,
   'translator_type': 'none',
   'protected': False,
   'verified': False,
   'followers_count': 26,
   'friends_count': 31,
   'listed_count': 0,
   'favourites_count': 198,
   'statuses_count': 1476,
   

In [72]:
len(tweets_data)

8648

## Reformat time and convert time zones UTC To ICT

In [73]:
thai_tz = pytz.timezone('Asia/Bangkok') # thai time zone

In [74]:
# find error in data
lst_error = []
for i in range(len(tweets_data)):
    try:
        tweets_data[i]['created_at'].split(' ')
    except:
        lst_error.append(i)

In [75]:
print(lst_error)
print('----------')
print(len(lst_error))

[]
----------
0


In [76]:
for i in range(len(tweets_data)):
    if i not in lst_error:
        lst = tweets_data[i]['created_at'].split(' ')
        date = (lst[5]+'.'+lst[1]+'.'+lst[2]+' '+lst[3])
        date_time = datetime.strptime(date, '%Y.%b.%d %H:%M:%S')
        tweets_data[i]['created_at'] =  date_time.replace(tzinfo=timezone.utc).astimezone(thai_tz).strptime(date, '%Y.%b.%d %H:%M:%S') # reset info and convert to thai timezone

In [115]:
time = [tweets_data[i]['created_at'] for i in range(len(tweets_data)) if i not in lst_error]
language = [tweets_data[i]['lang'] for i in range(len(tweets_data)) if i not in lst_error]
source = [tweets_data[i]['source'][tweets_data[i]['source'].find('w">')+3:].replace('</a>', '') for i in range(len(tweets_data)) if i not in lst_error]

## Create Data frame

In [123]:
df = pd.DataFrame({'time':time, 'language':language, 'source':source}) 

In [128]:
df['source'].value_counts()

Twitter for Android        3972
Twitter for iPhone         3947
Twitter Web App             454
Twitter for iPad            245
Twitter Web Client           11
TweetDeck                     8
TwitPane for Android          3
Twitter Media Studio          3
TweetCaster for Android       2
Plume for Android             1
Tweetbot for iΟS              1
Shopee_TH                     1
Name: source, dtype: int64

## Add group column into data frame

In [129]:
bins = list(range(0, 25*3600, 1*3600)) # Divide the time into 24 bin ( 1 hours / bin )
labels = ['0 pm','1 am','2 am','3 am','4 am',
          '5 am','6 am','7 am','8 am','9 am',
          '10 am','11 am','12 am','13 pm','14 pm',
         '15 pm','16 pm','17 pm','18 pm','19 pm',
          '20 pm','21 pm','22 pm','23 pm'] # set label
# print(list(range(0, 25, 2)))
# bins

In [130]:
df['sec'] = df.time.dt.hour * 3600 + df.time.dt.minute*60 + df.time.dt.second # convert time unit to second.
df['group'] = pd.cut(df['sec'], bins=bins, labels=labels) # set bin label into group column.

In [135]:
df.drop('sec', inplace=True, axis=1)

In [190]:
# df.head()

## Hashtag to data frame

In [178]:
hashtag_all = []
for j in range(len(tweets_data)):
    if j not in lst_error:
        lst = []
        for i in range(len(tweets_data[j]['entities']['hashtags'])):
            lst.append(tweets_data[j]['entities']['hashtags'][i]['text'])
        hashtag_all.append(lst)

In [179]:
hashtag_all

[['KCON2019THAILAND'],
 [],
 ['ZEYU', 'เจ๋ออวี่', 'BOYSTORY', 'KCON2019THAILAND', 'KCON19TH'],
 ['KIMJAEHWAN', 'KCONTHAILAND2019', 'KCON19TH', 'SeoulStreetFestivalTH'],
 ['GOT7', 'KCONTHAILAND2019', 'KCON2019THAILAND', 'ตลาดนัดอากาซ่', 'KCON19TH'],
 ['รองเท้าadidas',
  'รองเท้ามือสองของแท้',
  'รองเท้าผ้าใบ',
  'เกียมอุดม',
  'โยธินมรณะ',
  'ส่งต่อเสื้อผ้า',
  'เครื่องสําอางเกาหลี'],
 ['EXplOrationinBKK'],
 ['KCON2019THAILAND', 'TwitterBlueroom', 'THEBOYZ'],
 ['KCON2019THAILAND', 'TwitterBlueroom', 'HiPrae'],
 ['Straykids', 'KCONTHAILAND2019', 'KCON2019THAILAND'],
 ['BTS', 'got7', 'ไม่ดราม่านะจ๊ะ'],
 ['X1', 'X1_Debut', 'X1_FLAϟH', 'KCON2019Thailand'],
 ['KCON2019THAILAND'],
 ['X1', 'KCON2019THAILAND', 'MarketerK', 'MCDxKCON', 'KCON2019'],
 ['KCON2019THAILAND'],
 [],
 ['KCON2019THAILAND'],
 ['นัดรับ', 'บัตรแข็ง', 'KCON2019THAILAND'],
 ['โปรเจคเด็กหลง', 'StrayKids', 'KCON2019THAILAND'],
 ['KCON2019THAILAND'],
 [],
 [],
 ['KCON2019THAILAND'],
 ['KCON2019THAILAND'],
 [],
 ['KCON2019THAILAN

In [181]:
df['hashtag'] = hashtag_all

In [192]:
# df.head()

-----------

In [206]:
# alltext = []

# try: # Check error in data
#     for i in range(len(tweets_data)):
#         if i not in lst_error: # ensure that message, not in the list of error.
#             if re.search('Microsoft PowerApps and Flow',tweets_data[i]['source']): # if tweet posted by Microsoft PowerApps, the key for access text will have only one then this condition must be checked first.
#                 text = tweets_data[i]['text']
#                 alltext.append(text)
#             elif re.search('^RT' ,tweets_data[i]['text']): # check type of tweet [RT = retweet]
#                 if re.search('…$' ,tweets_data[i]['text']): # check the message that full message or summary message.
#                     try: # in normal case
#                         text = tweets_data[i]['retweeted_status']['extended_tweet']['full_text'] # access text.
#                         alltext.append(text) # append to the alltext list.
# #                         print(text)
# #                         print('--------------------')
#                     except: # in case of a full message not in key 'extended_tweet'.
#                         text = tweets_data[i]['retweeted_status']['text'] # access text.
#                         alltext.append(text) # append to the alltext list.
# #                         print(text)
# #                         print('--------------------')
#                 else:
#                     if re.search('… https://' ,tweets_data[i]['text']): # check the text is condensed but the embedded link is unabridged.
#                         text = tweets_data[i]['extended_tweet']['full_text'] # access text.
#                         alltext.append(text) # append to the alltext list.
# #                       print(text)
# #                       print('--------------------')
#                     else:
#                         text = tweets_data[i]['retweeted_status']['text'] # access text.
#                         alltext.append(text) # append to the alltext list.
# #                     print(text)
# #                     print('--------------------')
#             else: # in case of normal tweets
#                 if re.search('… https//' ,tweets_data[i]['text']): # check the text is condensed but the embedded link is unabridged.
#                     text = tweets_data[i]['extended_tweet']['full_text'] # access text.
#                     alltext.append(text) # append to the alltext list.
# #                     print(text)
# #                     print('--------------------')
#                 else:
#                     text = tweets_data[i]['text'] # access text.
#                     alltext.append(text) # append to the alltext list.
# #                     print(text)
# #                     print('--------------------')
# except:
#     print(f'ERROR FOUND!! INDEX : {i}')

In [207]:
# len(alltext)

In [204]:
df.to_csv('prepared.csv', encoding = 'utf-8')

In [205]:
data = pd.read_csv('prepared.csv')

In [203]:
data

Unnamed: 0.1,Unnamed: 0,time,language,source,group,hashtag
0,0,2019-09-07 01:21:50,th,Twitter for iPhone,1 am,['KCON2019THAILAND']
1,1,2019-09-07 01:22:05,und,Twitter for Android,1 am,[]
2,2,2019-09-07 01:22:23,th,Twitter for Android,1 am,"['ZEYU', 'เจ๋ออวี่', 'BOYSTORY', 'KCON2019THAI..."
3,3,2019-09-07 01:23:35,th,Twitter for Android,1 am,"['KIMJAEHWAN', 'KCONTHAILAND2019', 'KCON19TH',..."
4,4,2019-09-07 01:23:49,th,Twitter for Android,1 am,"['GOT7', 'KCONTHAILAND2019', 'KCON2019THAILAND..."
5,5,2019-09-07 01:23:51,und,Twitter for iPhone,1 am,"['รองเท้าadidas', 'รองเท้ามือสองของแท้', 'รองเ..."
6,6,2019-09-07 01:24:01,th,Twitter for iPhone,1 am,['EXplOrationinBKK']
7,7,2019-09-07 01:24:03,en,Twitter for Android,1 am,"['KCON2019THAILAND', 'TwitterBlueroom', 'THEBO..."
8,8,2019-09-07 01:24:13,en,Twitter for Android,1 am,"['KCON2019THAILAND', 'TwitterBlueroom', 'HiPrae']"
9,9,2019-09-07 01:24:18,th,Twitter for Android,1 am,"['Straykids', 'KCONTHAILAND2019', 'KCON2019THA..."
