# Prepare Data.

In [1]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import re
import pytz
# import deepcut
from datetime import datetime, timezone
from pythainlp import word_tokenize
from pythainlp.corpus.common import thai_stopwords

# Read data.

In [2]:
def read_data(tweets_data_path, encoding="utf-8"):
    """Load a line-delimited JSON file into a list of parsed records.

    Every line of the file is parsed on its own with ``json.loads``. Lines
    that are not valid JSON (blank lines, truncated records, ...) are skipped.

    Parameters
    ----------
    tweets_data_path : str or path-like
        Path of the text file to read, one JSON document per line.
    encoding : str, optional
        Text encoding used to open the file. Defaults to ``"utf-8"`` so the
        result does not depend on the platform's locale encoding.

    Returns
    -------
    list
        The parsed JSON values, in file order.
    """
    tweets_data = []
    # The context manager closes the file even if reading fails part-way.
    with open(tweets_data_path, "r", encoding=encoding) as tweets_file:
        for line in tweets_file:
            try:
                tweets_data.append(json.loads(line))
            except ValueError:
                # json.JSONDecodeError subclasses ValueError: skip malformed
                # lines only, instead of silencing every possible exception.
                continue
    return tweets_data

In [3]:
# Load one capture file into the list `tweets_data`.
# To process a different capture, swap the path for one of:
#   'data/28-09-62.txt', 'data/28-09-62-2.txt',
#   'data/29-09-62.txt', 'data/29-09-62-2.txt'
tweets_data = read_data(tweets_data_path='data/28-09-62-3.txt')

In [4]:
# tweets_data[:10]

In [5]:
# Number of records that were parsed from the file.
len(tweets_data)

560904

## Reformat timestamps and convert them from UTC to ICT (Indochina Time)

In [6]:
# Target time zone that the tweet timestamps are converted to.
thai_tz = pytz.timezone('Asia/Bangkok') # Thai time zone

In [7]:
# Collect the indices of records whose timestamp cannot be parsed: anything
# that is not a dict carrying a string 'created_at' field.
# An explicit type check replaces the former bare `except:`, which would also
# have swallowed KeyboardInterrupt and any unrelated error.
lst_error = [
    i for i, tweet in enumerate(tweets_data)
    if not (isinstance(tweet, dict) and isinstance(tweet.get('created_at'), str))
]

In [8]:
# Show the rejected indices, a separator line, then how many there are.
print(lst_error, '----------', len(lst_error), sep='\n')

[9485, 10642, 35721, 39327, 39895, 42307, 43572, 44590, 45424, 45836, 46102, 46552, 46866, 47350, 47401, 47486, 48391, 49176, 49412, 49546, 49829, 50455, 50789, 51618, 53884, 53935, 54131, 54224, 54357, 54497, 54657, 54905, 55164, 55797, 55892, 59680, 60876, 61202, 61729, 61861, 62272, 63322, 63628, 63679, 64753, 64880, 64931, 65028, 65157, 65525, 65785, 66094, 66416, 66511, 66954, 67205, 67618, 68675, 68726, 70791, 70925, 71326, 72072, 72993, 73577, 73858, 75072, 75172, 75977, 76537, 77016, 77067, 77208, 77465, 77811, 78173, 79148, 79484, 79619, 79670, 80422, 81050, 81257, 81390, 81564, 82060, 82307, 82480, 82905, 82956, 83042, 83255, 83480, 83856, 84299, 84762, 84851, 85038, 85089, 85406, 85583, 85913, 86602, 86789, 86840, 86941, 87096, 87188, 87370, 87585, 87728, 88037, 88131, 88913, 89127, 89295, 89903, 90542, 90593, 90841, 91698, 92706, 92985, 93112, 93754, 93805, 94258, 94662, 94757, 94808, 94949, 95653, 95754, 95853, 95975, 96063, 96245, 96345, 96396, 96663, 96980, 97116, 97167,

In [9]:
# Parse each valid tweet's 'created_at' string, treat it as UTC (as the
# original code did) and store it back as a timezone-aware datetime in Thai time.
error_idx = set(lst_error)  # set -> O(1) membership instead of scanning the list
for i, tweet in enumerate(tweets_data):
    if i in error_idx:
        continue
    created_at = tweet['created_at']
    if not isinstance(created_at, str):
        # Already converted by an earlier run of this cell; leave it alone so
        # the cell can be re-executed safely.
        continue
    lst = created_at.split(' ')
    date = lst[5] + '.' + lst[1] + '.' + lst[2] + ' ' + lst[3]  # 'YYYY.Mon.DD HH:MM:SS'
    date_time = datetime.strptime(date, '%Y.%b.%d %H:%M:%S')
    # The former trailing `.strptime(date, ...)` was a classmethod call that
    # re-parsed the string, discarding the conversion and storing naive UTC.
    tweet['created_at'] = date_time.replace(tzinfo=timezone.utc).astimezone(thai_tz)

In [10]:
# Select the valid records once, then pull each field from that selection.
# The set gives O(1) membership; the original scanned `lst_error` for every
# record in each of the three comprehensions.
error_idx = set(lst_error)
valid_tweets = [tweet for i, tweet in enumerate(tweets_data) if i not in error_idx]

time = [tweet['created_at'] for tweet in valid_tweets]
language = [tweet['lang'] for tweet in valid_tweets]
# Keep what follows the first 'w">' in 'source' and drop the closing '</a>'
# (presumably 'source' is an HTML anchor naming the client app - confirm).
source = [
    tweet['source'][tweet['source'].find('w">') + 3:].replace('</a>', '')
    for tweet in valid_tweets
]

## Create the DataFrame

In [11]:
# One row per valid tweet; columns appear in the order given here.
df = pd.DataFrame(dict(time=time, language=language, source=source))

In [12]:
# df['source'].value_counts()

## Add an hour-of-day group column to the DataFrame

In [13]:
# 25 edges, one hour (3600 s) apart, delimit the 24 hour-of-day bins.
bins = list(range(0, 25*3600, 1*3600))
# One label per bin: '0 am' ... '11 am', '12 pm' ... '23 pm'.
# Generated rather than hand-typed: the previous literal list mislabelled
# hour 0 as '0 pm' and hour 12 as '12 am'.
labels = [f"{hour} {'am' if hour < 12 else 'pm'}" for hour in range(24)]

In [14]:
# Seconds elapsed since midnight for every tweet timestamp.
df['sec'] = df.time.dt.hour * 3600 + df.time.dt.minute*60 + df.time.dt.second
# right=False makes every bin [start, end), so hour h covers h:00:00-h:59:59.
# With the default right-closed bins, sec == 0 fell outside the first bin
# (NaN) and exact-hour timestamps landed in the previous hour's group.
df['group'] = pd.cut(df['sec'], bins=bins, labels=labels, right=False)

In [15]:
# The helper column was only needed to compute 'group'; remove it in place.
df.drop(columns=['sec'], inplace=True)

In [16]:
# df.head()

## Add hashtags to the DataFrame

In [17]:
# For every valid tweet, collect the text of each hashtag entity; a tweet with
# no hashtags contributes an empty list, keeping one entry per DataFrame row.
# The set gives O(1) membership instead of scanning `lst_error` per record.
error_idx = set(lst_error)
hashtag_all = [
    [tag['text'] for tag in tweet['entities']['hashtags']]
    for j, tweet in enumerate(tweets_data)
    if j not in error_idx
]

In [18]:
# hashtag_all

In [19]:
# Attach the per-tweet hashtag lists as a new 'hashtag' column.
df = df.assign(hashtag=hashtag_all)

In [20]:
# df.head()

-----------

In [21]:
# alltext = []

# try: # Check error in data
#     for i in range(len(tweets_data)):
#         if i not in lst_error: # ensure that message, not in the list of error.
#             if re.search('Microsoft PowerApps and Flow',tweets_data[i]['source']): # if tweet posted by Microsoft PowerApps, the key for access text will have only one then this condition must be checked first.
#                 text = tweets_data[i]['text']
#                 alltext.append(text)
#             elif re.search('^RT' ,tweets_data[i]['text']): # check type of tweet [RT = retweet]
#                 if re.search('…$' ,tweets_data[i]['text']): # check the message that full message or summary message.
#                     try: # in normal case
#                         text = tweets_data[i]['retweeted_status']['extended_tweet']['full_text'] # access text.
#                         alltext.append(text) # append to the alltext list.
# #                         print(text)
# #                         print('--------------------')
#                     except: # in case of a full message not in key 'extended_tweet'.
#                         text = tweets_data[i]['retweeted_status']['text'] # access text.
#                         alltext.append(text) # append to the alltext list.
# #                         print(text)
# #                         print('--------------------')
#                 else:
#                     if re.search('… https://' ,tweets_data[i]['text']): # check the text is condensed but the embedded link is unabridged.
#                         text = tweets_data[i]['extended_tweet']['full_text'] # access text.
#                         alltext.append(text) # append to the alltext list.
# #                       print(text)
# #                       print('--------------------')
#                     else:
#                         text = tweets_data[i]['retweeted_status']['text'] # access text.
#                         alltext.append(text) # append to the alltext list.
# #                     print(text)
# #                     print('--------------------')
#             else: # in case of normal tweets
#                 if re.search('… https//' ,tweets_data[i]['text']): # check the text is condensed but the embedded link is unabridged.
#                     text = tweets_data[i]['extended_tweet']['full_text'] # access text.
#                     alltext.append(text) # append to the alltext list.
# #                     print(text)
# #                     print('--------------------')
#                 else:
#                     text = tweets_data[i]['text'] # access text.
#                     alltext.append(text) # append to the alltext list.
# #                     print(text)
# #                     print('--------------------')
# except:
#     print(f'ERROR FOUND!! INDEX : {i}')

In [22]:
# len(alltext)

In [23]:
# Persist the prepared frame (index included) as UTF-8 CSV.
df.to_csv(path_or_buf='prepared03.csv', encoding='utf8')

In [24]:
# data = pd.read_csv('prepared01.csv')

In [52]:
# data