In [None]:
import pandas as pd

# Read the tweets CSV from the working directory into a DataFrame
df = pd.read_csv('tweets.csv')

In [None]:
# Preview the first five rows of the loaded data
df.head(n=5)

## Most mentioned word in Word Cloud

In [None]:
%%time
# Reference : https://malaya.readthedocs.io/en/latest/Api.html#module-malaya.preprocessing
import malaya

# Build the Malaya tokenizer (created with hashtags=False) and the deep stemming model.
tokenizer = malaya.preprocessing.Tokenizer(hashtags=False)
extracbas = malaya.stem.deep_model()

# One entry per tweet: that tweet's stemmed tokens joined into a single string.
tweetlist = []

for tweet in df['tweet_text']:
    stemmed_tokens = []
    for token in tokenizer.tokenize(tweet):
        expanded = malaya.preprocessing.unpack_english_contractions(token)
        stemmed_tokens.append(extracbas.stem(expanded, beam_search=False))
    # Collect every token's stem (previously only the last token of each tweet
    # survived). A fresh list per tweet means a tweet with no tokens yields ""
    # instead of silently reusing the previous tweet's stem.
    tweetlist.append(' '.join(stemmed_tokens))

tweetlist


In [None]:
# Set Custom Stopwords
from nltk.corpus import stopwords

# Extra stop words observed in the tweets, added on top of NLTK's English list
new_words = [
    "co", "http", "di", "yang", "dan", "amp", "ini", "untuk", "n", "ni", "ada",
    "kami", "yg", "ke", "1", "2", "nak", "daruratbanjir", "3", "lagi", "tak", "kita",
]
stop_words = set(stopwords.words('english')) | set(new_words)

In [None]:
#Define function to lemmatise data and add stop word

import nltk
def cleanData(raw_text):
    """Tokenise, lowercase, stop-word filter and lemmatise one piece of text.

    Args:
        raw_text: The text to clean (e.g. a single tweet).

    Returns:
        A string in which every kept word is lowercased, lemmatised and
        preceded by one space. A non-empty result therefore starts with a
        space; text with no kept words yields "". This format is kept
        unchanged because existing callers concatenate the results directly.

    Notes:
        Reads the module-level ``stop_words`` set.
    """
    from nltk.stem.wordnet import WordNetLemmatizer
    from nltk.tokenize import RegexpTokenizer

    # Lowercase BEFORE lemmatising. NOTE(review): WordNet lookups appear to be
    # done against lowercase forms, so lemmatising the original-case token
    # would leave capitalised words (e.g. "Floods") untouched.
    words = [word.lower() for word in RegexpTokenizer(r"\w+").tokenize(raw_text)]
    kept = [word for word in words if word not in stop_words]

    lem = WordNetLemmatizer()
    # Single join instead of repeated string concatenation; also avoids
    # shadowing the builtin ``str``.
    return ''.join(' ' + lem.lemmatize(word) for word in kept)

In [None]:
# Observe stopwords and add to new_words
import pandas

# Clean every tweet (one cleaned string per row)
df_word = df['tweet_text'].apply(cleanData)

# Join with an explicit space so words from adjacent tweets can never fuse,
# regardless of whether the cleaned strings carry their own leading space.
# Then count the words and keep the 40 most frequent.
freq = pandas.Series(' '.join(df_word).split()).value_counts().head(40)
print(freq)

In [None]:
# Create Wordcloud and Save

import os
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS,ImageColorGenerator

import matplotlib.pyplot as plt
%matplotlib inline

# Convert the cleaned tweets into a plain list of strings
abstract = df_word.values.tolist()

# Create the word cloud from the tweets joined into one text.
# str(abstract) would pass the list's repr instead, so brackets, commas and
# quote characters reach the tokenizer and can end up attached to words.
wordcloud = WordCloud(
    background_color='white',
    stopwords=stop_words,
    max_words=100,
    max_font_size=50,
    random_state=42,
).generate(' '.join(abstract))

# Display the generated image:
plt.figure(figsize=(10,10)) #inches
plt.axis("off")
plt.imshow(wordcloud, interpolation='bilinear')

plt.show()

# Save wordcloud to django static folder.
# os.path.join picks the right separator on every OS; the previous hard-coded
# backslashes only formed a directory path on Windows.
path = os.path.join(os.getcwd(), "WeWarnYou-dashboard", "apps", "static", "assets", "wc-word.png")
wordcloud.to_file(path)

## Most mentioned city in Word Cloud

In [None]:
# Keep only rows with a usable city: not missing and not the literal "[]" placeholder
has_city = df['city'].notna() & (df['city'] != "[]")
df_city = df[has_city]

df_city

In [None]:
# Cleaning city data from "['Putrajaya']" become "Putrajaya" & Add to an empty wc_city list

import re

wc_city_list = []
# Raw string: "\[" and "\]" are invalid escapes in a normal string literal
# and trigger warnings on recent Python versions.
characters_to_remove = r"\[\]'"
pattern = "[" + characters_to_remove + "]"

for city in df_city['city']:
    names = re.sub(pattern, "", city).split(',')
    # "['A', 'B']" becomes "A, B": strip the space left after each comma and
    # skip empty fragments so every entry is a clean city name.
    wc_city_list.extend(name.strip() for name in names if name.strip())

print(wc_city_list)

In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud

from collections import Counter

# Count whole city names. Joining the names into one text and calling
# generate() lets WordCloud re-tokenise it, which breaks multi-word names
# (e.g. "Kuala Lumpur") into separate words. Names are stripped here so that
# entries differing only by surrounding whitespace are counted together.
city_counts = Counter(name.strip() for name in wc_city_list if name.strip())
wordcloud = WordCloud(background_color="white", repeat=False).generate_from_frequencies(city_counts)

# Display the generated image:
plt.figure(figsize=(10,10)) #inches
plt.axis("off")
plt.imshow(wordcloud, interpolation='bilinear')

plt.show()

# Save wordcloud to django static folder.
# os.path.join picks the right separator on every OS; the previous hard-coded
# backslashes only formed a directory path on Windows.
path = os.path.join(os.getcwd(), "WeWarnYou-dashboard", "apps", "static", "assets", "wc-location.png")
wordcloud.to_file(path)

## Line graph: count of flood-related hashtag tweets over time

In [None]:
# Break "created_at" at its first space: the left part becomes the date, the remainder the time
timestamp = df["created_at"].str.split(pat=" ", n=1, expand=True)
date_part, time_part = timestamp[0], timestamp[1]
df["date"] = pd.to_datetime(date_part)
df["time"] = time_part

In [None]:
# Count the non-null values of every column per calendar day
day_key = df["date"].dt.floor("1D")
df_count = df.groupby(day_key).count()
df_count

In [None]:
# Convention for import of the pyplot interface
import matplotlib.pyplot as plt

# Set-up to have matplotlib use its support for notebook inline plots
%matplotlib inline

fig, ax = plt.subplots(figsize=(10, 6))

# Plot the per-day count of the tweet_text column as a single blue line
ax.plot(df_count.tweet_text, color='tab:blue', label='Count')

# Axis labels, title, grid and legend
ax.set_xlabel('Date')
ax.set_ylabel('Count')
ax.set_title('Flood-related Tweets')
ax.grid(True)
ax.legend(loc='upper left');


# Save line graph to django static folder.
# os.path.join picks the right separator on every OS; the previous hard-coded
# backslashes only formed a directory path on Windows.
path = os.path.join(os.getcwd(), "WeWarnYou-dashboard", "apps", "static", "assets", "flood-trend.png")
# Save through the figure object created above rather than pyplot's implicit current figure
fig.savefig(path)

## Most mentioned party in tweets

In [None]:
# Declare Malaysian coalitions ('Party') and their component parties ('Member'), as of Jan 2022

df_party = pd.DataFrame([
    ['PH', 'DAP'],['PH', 'PKR'],['PH', 'AMANAH'],
    ['PN', 'BERSATU'],['PN', 'PAS'],['PN', 'GERAKAN'],
    # 'MIC' (Malaysian Indian Congress) replaces the earlier 'MIS', which appears to be a typo
    ['BN', 'UMNO'],['BN', 'MCA'],['BN', 'MIC']
], columns=['Party','Member'])

df_party

In [None]:
# Count how many times each party is mentioned across all tweets
import re

# One total per row of df_party, in the same order
count = []

for member in df_party['Member']:

    # Match the name in upper case or lower case, as a whole word only.
    # The previous pattern "\sDAP|dap\s" grouped as (\sDAP)|(dap\s): it matched
    # prefixes and suffixes of longer words (" DAPAT", "hadap ") and missed
    # mentions at the very start or end of a tweet.
    # For eg, DAP -> r"(?<!\w)(?:DAP|dap)(?!\w)"
    reg = r"(?<!\w)(?:" + re.escape(member) + "|" + re.escape(member.lower()) + r")(?!\w)"
    # int(): the sum comes back as a float when some tweets are missing (NaN)
    s = int(df['tweet_text'].str.count(reg).sum())
    count.append(s)

df_party['Count'] = count
df_party

In [None]:
# The nested pie chart below is based on the following references
# Ref 1 - https://stackoverflow.com/questions/67210640/how-can-i-draw-a-nested-pie-graph-in-matplotlib-in-python
# Ref 2 - https://matplotlib.org/stable/gallery/pie_and_polar_charts/nested_pie.html

In [None]:
# Total mentions per coalition (outer ring of the pie chart).
# Select 'Count' explicitly: on pandas >= 2.0 a bare .sum() also "sums"
# (concatenates) the string 'Member' column, leaving a non-numeric column
# in the result.
df_party_outer = df_party.groupby('Party')[['Count']].sum()
df_party_outer

In [None]:
# Sum per (Party, Member) pair; the result is indexed by both keys
by_party_and_member = df_party.groupby(by=['Party', 'Member'])
df_party_inner = by_party_and_member.sum()

df_party_inner

In [None]:
# Inner pie labels: the 'Member' level (second level) of the grouped index
inner_labels = df_party_inner.index.get_level_values('Member')
inner_labels

In [None]:
# Define color for pie chart
# numpy is imported here because this cell runs before the notebook's later
# `import numpy as np`; without it a fresh Run All fails with a NameError.
import numpy as np

# plt.get_cmap replaces plt.cm.get_cmap, which newer matplotlib releases no longer provide
cmap = plt.get_cmap("tab20c")
# tab20c is organised as hue groups of four shades: the outer ring takes the
# first shade of three groups (indices 0, 4, 8) ...
outer_colors = cmap(np.arange(3)*4)
# ... and the inner wedges take the remaining three shades of the same group,
# so no inner wedge repeats its outer ring's colour (indices 4 and 8 did).
inner_colors = cmap([1, 2, 3, 5, 6, 7, 9, 10, 11])

In [None]:
# Creating autopct arguments
def func(pct, allvalues):
    """Format a pie wedge label as a percentage plus the absolute count.

    Args:
        pct: The wedge's share of the whole, in percent.
        allvalues: All values making up the pie (list, array, Series or
            DataFrame); every element is summed to get the total.

    Returns:
        A two-line string such as "50.0%\\n(2)".
    """
    # np.asarray(...).sum() always gives one scalar total; np.sum() on a
    # DataFrame returns a per-column Series instead.
    total = float(np.asarray(allvalues).sum())
    # Round rather than truncate: 29% of 100 evaluates to 28.999999999999996
    # in floating point and would otherwise be reported as 28.
    absolute = int(round(float(pct) / 100. * total))
    return "{:.1f}%\n({:d})".format(pct, absolute)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

fig, ax = plt.subplots(figsize=(40,20))
size = 0.3  # radial width of each ring

# Use the 'Count' column explicitly so only mention totals reach the pie,
# even if the grouped frames carry additional columns.
outer_counts = df_party_outer['Count']
inner_counts = df_party_inner['Count']

# Outer ring: one wedge per coalition
ax.pie(outer_counts.values, radius=1,
       labels=df_party_outer.index,
       autopct = lambda pct: func(pct, outer_counts),
       pctdistance = 0.9,
       colors=outer_colors,
       wedgeprops=dict(width=size, edgecolor='black'))

# Inner ring: one wedge per member party, drawn inside the outer ring
ax.pie(inner_counts.values, radius=1-size,
       labels = inner_labels,
       autopct = lambda pct: func(pct, inner_counts),
       pctdistance = 0.9,
       colors=inner_colors,
       labeldistance = 0.4,
       rotatelabels = True,
       wedgeprops=dict(width=size, edgecolor='black'))

# Save pie chart to django static folder.
# os.path.join picks the right separator on every OS; the previous hard-coded
# backslashes only formed a directory path on Windows.
path = os.path.join(os.getcwd(), "WeWarnYou-dashboard", "apps", "static", "assets", "pc-party.png")

fig.savefig(path, bbox_inches="tight")

plt.show()