# Prepare Data.

In [70]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import re
import pytz
# import deepcut
from datetime import datetime, timezone
from pythainlp import word_tokenize
from pythainlp.corpus.common import thai_stopwords

# Read data.

In [71]:
def read_data(tweets_data_path):
    """Load tweets from a file that holds one JSON document per line.

    Lines that are not valid JSON (blank lines, truncated records) are
    skipped, so a single bad record does not abort the whole load.

    Parameters
    ----------
    tweets_data_path : str or path-like
        Path of the line-delimited JSON file to read.

    Returns
    -------
    list
        The decoded JSON values, in file order.
    """
    tweets_data = []
    # The context manager closes the file even when reading fails part-way.
    # The encoding is explicit so non-ASCII (Thai) text decodes the same way
    # on every platform instead of depending on the locale default.
    with open(tweets_data_path, "r", encoding="utf-8") as tweets_file:
        for line in tweets_file:
            try:
                tweets_data.append(json.loads(line))
            except ValueError:
                # json.JSONDecodeError is a ValueError subclass; anything
                # else (e.g. KeyboardInterrupt) is allowed to propagate.
                continue
    return tweets_data

In [95]:
# Load one capture file into the list tweets_data. The other capture files
# are kept below, commented out, so a different one can be switched in.
# tweets_data = read_data('data/28-09-62.txt')
# tweets_data = read_data('data/28-09-62-2.txt')
# tweets_data = read_data('data/29-09-62.txt')
# tweets_data = read_data('data/29-09-62-2.txt')
tweets_data = read_data(tweets_data_path='data/split_28-09-62-3/4.txt')

In [96]:
# tweets_data[0]

In [97]:
# Number of records that were parsed from the input file.
len(tweets_data)

115411

## Reformat time and convert the time zone from UTC to ICT

In [98]:
# Target time zone used by the cells below when converting tweet timestamps.
thai_tz = pytz.timezone('Asia/Bangkok')  # Thai time zone

In [99]:
# Collect the positions of records that cannot be time-stamped: the record is
# not a JSON object, or its 'created_at' is missing or is not a string.
# The cells below skip every position listed here.
lst_error = []
for i, tweet in enumerate(tweets_data):
    try:
        tweet['created_at'].split(' ')
    except (KeyError, TypeError, AttributeError):
        # KeyError: no 'created_at' key; TypeError: the record is not a dict;
        # AttributeError: 'created_at' is not a string. Any other exception
        # (including KeyboardInterrupt) is deliberately not swallowed.
        lst_error.append(i)

In [100]:
# Show the flagged positions, a separator line, and how many were flagged.
print(lst_error, '----------', len(lst_error), sep='\n')

[36, 87, 138, 189, 288, 339, 390, 539, 590, 736, 787, 838, 889, 940, 991, 1091, 1235, 1286, 1337, 1388, 1439, 1490, 1541, 1592, 1643, 1694, 1745, 1796, 1847, 1939, 1990, 2088, 2139, 2190, 2386, 2437, 2488, 2574, 2673, 2724, 2775, 2873, 2969, 3020, 3071, 3122, 3173, 3271, 3372, 3423, 3514, 3565, 3752, 3844, 3895, 3946, 3998, 4048, 4099, 4150, 4201, 4252, 4303, 4354, 4405, 4456, 4507, 4558, 4609, 4660, 4711, 4762, 4813, 4864, 4915, 4966, 5017, 5068, 5119, 5170, 5221, 5272, 5323, 5374, 5425, 5476, 5527, 5578, 5629, 5680, 5731, 5782, 5833, 5884, 5935, 5986, 6037, 6088, 6139, 6190, 6241, 6292, 6343, 6394, 6445, 6496, 6547, 6598, 6649, 6700, 6751, 6802, 6853, 6904, 6956, 7006, 7057, 7108, 7159, 7210, 7261, 7312, 7363, 7414, 7514, 7565, 7616, 7667, 7718, 7819, 7870, 7921, 7972, 8023, 8121, 8172, 8223, 8417, 8517, 8568, 8619, 8670, 8721, 8772, 8823, 8874, 8925, 9024, 9125, 9176, 9227, 9278, 9329, 9380, 9431, 9482, 9533, 9584, 9635, 9686, 9737, 9788, 9839, 9890, 9941, 9992, 10043, 10094, 10145,


----------
1238


In [101]:
# tweets_data[110754]

In [102]:
# Replace every usable 'created_at' string with a datetime in Thai local time.
error_positions = set(lst_error)  # O(1) membership instead of scanning the list per record
for i, tweet in enumerate(tweets_data):
    if i in error_positions:
        continue
    created_at = tweet['created_at']
    if not isinstance(created_at, str):
        # Already converted by an earlier run of this cell; leaving it alone
        # makes the cell safe to re-run.
        continue
    fields = created_at.split(' ')
    # Fields used: [1] month abbreviation, [2] day, [3] HH:MM:SS, [5] year.
    stamp = '.'.join([fields[5], fields[1], fields[2]]) + ' ' + fields[3]
    # The parsed value is treated as UTC (field [4], the offset, is ignored).
    utc_time = datetime.strptime(stamp, '%Y.%b.%d %H:%M:%S').replace(tzinfo=timezone.utc)
    # Shift to Thai time, then drop tzinfo so the stored value stays a naive
    # datetime. The previous code chained `.strptime(...)` here; strptime is a
    # classmethod, so it re-parsed `stamp` and threw the conversion away,
    # leaving every timestamp in UTC.
    tweet['created_at'] = utc_time.astimezone(thai_tz).replace(tzinfo=None)

In [103]:
def _client_name(source_html):
    """Return the text after the first 'w">' in a tweet's 'source' value, without '</a>'.

    When the marker is absent the value is returned unchanged (minus any
    '</a>'). The previous slice started at index 2 in that case, because
    str.find returns -1, and silently dropped the first two characters.
    """
    marker = 'w">'  # presumably the tail of the anchor's opening tag -- TODO confirm
    start = source_html.find(marker)
    text = source_html[start + len(marker):] if start != -1 else source_html
    return text.replace('</a>', '')


# Keep only the records that were not flagged in lst_error. The set makes the
# membership test O(1) instead of a list scan for every record.
_error_positions = set(lst_error)
_valid_tweets = [tweet for i, tweet in enumerate(tweets_data) if i not in _error_positions]

# Parallel lists, one entry per usable tweet, in file order.
time = [tweet['created_at'] for tweet in _valid_tweets]
language = [tweet['lang'] for tweet in _valid_tweets]
source = [_client_name(tweet['source']) for tweet in _valid_tweets]

## Create Data frame

In [104]:
# One row per usable tweet; columns come from the three parallel lists.
df = pd.DataFrame(dict(time=time, language=language, source=source))

In [105]:
# df['source'].value_counts()

## Add group column into data frame

In [106]:
# Bin edges in seconds since midnight: 0, 3600, ..., 86400
# (25 edges -> 24 one-hour bins).
bins = [hour * 3600 for hour in range(25)]

# One label per bin, in bin order.
# NOTE(review): '0 pm' (first hour of the day) and '12 am' (the noon hour)
# look mislabelled. They are left unchanged because the label text is written
# to the 'group' column -- confirm with consumers of that column before renaming.
labels = [
    '0 pm', '1 am', '2 am', '3 am', '4 am', '5 am',
    '6 am', '7 am', '8 am', '9 am', '10 am', '11 am',
    '12 am', '13 pm', '14 pm', '15 pm', '16 pm', '17 pm',
    '18 pm', '19 pm', '20 pm', '21 pm', '22 pm', '23 pm',
]
# print(list(range(0, 25, 2)))
# bins

In [107]:
# Seconds elapsed since midnight for each tweet.
df['sec'] = df.time.dt.hour * 3600 + df.time.dt.minute * 60 + df.time.dt.second
# right=False makes every bin left-closed, [start, end): 00:00:00 falls in the
# first bin and an exact hour such as 01:00:00 falls in the hour it starts.
# With the default right=True the bins are (start, end], so second 0 matched
# no bin (NaN) and exact hours were counted in the previous hour.
df['group'] = pd.cut(df['sec'], bins=bins, labels=labels, right=False)

In [108]:
# The helper column was only needed to compute 'group'; remove it in place.
df.drop(columns=['sec'], inplace=True)

In [109]:
# df.head()

## Add hashtags to the data frame

In [110]:
# Hashtag texts for every usable tweet: one list per tweet, in file order,
# using the same lst_error filter as the other columns.
# The set makes the membership test O(1) instead of a list scan per record.
_error_positions = set(lst_error)
hashtag_all = [
    [tag['text'] for tag in tweet['entities']['hashtags']]
    for j, tweet in enumerate(tweets_data)
    if j not in _error_positions
]

In [111]:
# hashtag_all

In [112]:
# Attach the per-tweet hashtag lists as a new column. hashtag_all is built with
# the same lst_error filter as the other columns, so it is expected to line up
# row-for-row with df.
df['hashtag'] = hashtag_all

In [113]:
# df.head()

-----------

In [114]:
# Persist the prepared frame next to the notebook as UTF-8 CSV.
df.to_csv(path_or_buf='prepared03-4.csv', encoding='utf8')

In [92]:
# alltext = []

# try: # Check error in data
#     for i in range(len(tweets_data)):
#         if i not in lst_error: # ensure that message, not in the list of error.
#             if re.search('Microsoft PowerApps and Flow',tweets_data[i]['source']): # if tweet posted by Microsoft PowerApps, the key for access text will have only one then this condition must be checked first.
#                 text = tweets_data[i]['text']
#                 alltext.append(text)
#             elif re.search('^RT' ,tweets_data[i]['text']): # check type of tweet [RT = retweet]
#                 if re.search('…$' ,tweets_data[i]['text']): # check the message that full message or summary message.
#                     try: # in normal case
#                         text = tweets_data[i]['retweeted_status']['extended_tweet']['full_text'] # access text.
#                         alltext.append(text) # append to the alltext list.
# #                         print(text)
# #                         print('--------------------')
#                     except: # in case of a full message not in key 'extended_tweet'.
#                         text = tweets_data[i]['retweeted_status']['text'] # access text.
#                         alltext.append(text) # append to the alltext list.
# #                         print(text)
# #                         print('--------------------')
#                 else:
#                     if re.search('… https://' ,tweets_data[i]['text']): # check the text is condensed but the embedded link is unabridged.
#                         text = tweets_data[i]['extended_tweet']['full_text'] # access text.
#                         alltext.append(text) # append to the alltext list.
# #                       print(text)
# #                       print('--------------------')
#                     else:
#                         text = tweets_data[i]['retweeted_status']['text'] # access text.
#                         alltext.append(text) # append to the alltext list.
# #                     print(text)
# #                     print('--------------------')
#             else: # in case of normal tweets
#                 if re.search('… https://' ,tweets_data[i]['text']): # check the text is condensed but the embedded link is unabridged.
#                     text = tweets_data[i]['extended_tweet']['full_text'] # access text.
#                     alltext.append(text) # append to the alltext list.
# #                     print(text)
# #                     print('--------------------')
#                 else:
#                     text = tweets_data[i]['text'] # access text.
#                     alltext.append(text) # append to the alltext list.
# #                     print(text)
# #                     print('--------------------')
# except:
#     print(f'ERROR FOUND!! INDEX : {i}')