## CMPE 256 Summer 19

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from os import path
from PIL import Image
import os
import re

import nltk
# nltk.download('wordnet')
# nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

from wordcloud import WordCloud, STOPWORDS

In [None]:
# nltk.download('wordnet')
# nltk.download('stopwords')
# nltk.download('punkt')

## Scrape Tweets

In [None]:
# import twint
# c = twint.Config()
# c.Search = "the lion king"
# c.Store_json = True
# c.Output = "my_twitter_json.json"
# twint.run.Search(c)

## Read Data

In [None]:
df = pd.read_json('my_twitter_json.json',lines=True)
df.head(2)

In [None]:
df.dtypes

In [None]:
### to remove duplicated records
df.groupby('id').size()

In [None]:
df[df.id == 1157990430451142656]

In [None]:
df[df.id == 1157992220995117056]

In [None]:
df[df.id == 1157991907294736385]

In [None]:
## remove duplicated rows by id
## drop_duplicates keeps the original index labels, leaving gaps; reset the
## index so labels are 0..n-1 again and label lookups such as df.tweet[i] work
print("Before remove duplicates: " + str(df.shape))
df = df.drop_duplicates(subset = 'id', keep='first').reset_index(drop=True)
print("After remove duplicates: " + str(df.shape))

In [None]:
## Drop less important columns
df = df.drop(columns=['retweet','cashtags','video','retweets_count','replies_count','photos'])
df.head(3)

In [None]:
print(df.tweet[0])
print(df.urls[0])

In [None]:
# head(10) selects by position, so it works even when the index has gaps
# (and when the frame holds fewer than 10 rows)
for tweet in df.tweet.head(10):
    print(tweet)

In [None]:
## Notice that there is usually a hyperlink at the end of a tweet. To analyze the text more precisely,
## we remove these URLs.

In [None]:
text = df.tweet[0]
print(text)

In [None]:
def remove_urls(text):
    r""" remove url links in text

    Two kinds of link are stripped:
      * anything starting with ``http`` up to the next whitespace
      * scheme-less twitter.com links such as ``pic.twitter.com/XXXX``

    Args:
        text: string

    Returns:
        string with the links replaced by the empty string (surrounding
        whitespace is left as is)
    """
    text = re.sub(r"http\S+", "", text) #remove urls: http://XXXX
    # The dots are escaped and the prefix is \S* so the pattern can only
    # consume the link token itself; an unescaped '.' also matches a space
    # and would swallow the word in front of a bare "twitter.com/..." link.
    text = re.sub(r"\S*twitter\.com/\S+", "", text)
    return text

In [None]:
print(remove_urls(text))

In [None]:
df['tweet_no_url'] = df['tweet'].map(remove_urls)

In [None]:
df['tweet_no_url'][:10]

In [None]:
# head(20) selects by position, so it works even when the index has gaps
for tweet in df['tweet_no_url'].head(20):
    print(tweet)

In [None]:
df['just_date'] = df['date'].dt.date

In [None]:
df.shape

In [None]:
date_count = df.groupby('just_date').agg({'id': np.size})

In [None]:
date_count.head()

In [None]:
plt.style.use('ggplot')
ax = date_count.plot(kind='bar',figsize=(15,10),legend=False,color='c')
ax.set_title("Tweets Daily Frequency")
ax.set_xlabel("Date",fontsize=12)
ax.set_ylabel("# of Tweets",fontsize=12)
plt.show()

In [None]:
## All the records are from PDT Timezone
## Will drop timezone 
df.groupby('timezone').size()

In [None]:
df = df.drop(columns=['timezone'])
df.head(2)

In [None]:
df.time[10:13]

In [None]:
def extract_hour(string_time):
    r""" Return the hour part of a colon-separated time string as an int

    Example: '09:01:03' -> 9

    Args:
        string_time: time string such as '09:01:03'

    Returns:
        int: the text before the first ':' converted with int()
    """
    # everything before the first ':' is the hour field
    hour_field, _, _ = string_time.partition(":")
    return int(hour_field)

In [None]:
df['hour'] = df.time.map(extract_hour)

In [None]:
df.hour[0:4]

In [None]:
hourly_count = df.groupby('hour').agg({'id': np.size})
plt.style.use('ggplot')
ax = hourly_count.plot(kind='bar',figsize=(15,10),legend=False,color='c')
ax.set_title("Tweets Hourly Frequency")
ax.set_xlabel("Hour",fontsize=12)
ax.set_ylabel("# of Tweets",fontsize=12)
plt.show()

In [None]:
# concat all tweets into one text; the space separator keeps the last word of
# one tweet from fusing with the first word of the next in the word cloud
all_text = ' '.join(df.tweet_no_url)

In [None]:
wordcloud = WordCloud(max_font_size=40).generate(all_text)
plt.figure(figsize=(15,10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
stop = stopwords.words('english')
more_stops = ['lion','king','the',"didn’t",'lionking','twitter','http','https','post','posts','movie','lion-king']
stop = stop + more_stops

In [None]:
text = df.tweet_no_url[2]
text = text.lower()
text

In [None]:
df.head(3)

In [None]:
stop = set(stopwords.words('english'))
more_stops = set(['lion','king','the',"didn’t",'lionking','thelionking',\
                  'twitter','http','https','post','posts','movie','lion-king',\
             "'s","via","go","gonna","wanna","going","got"])
MY_STOP = stop.union(more_stops)


def preprocess_text(text):
    r""" Preprocess one tweet for word-frequency analysis

    Steps: lower-case the text, strip twitter.com short links and http(s)
    urls, tokenize with nltk.word_tokenize, then drop every token that is in
    MY_STOP or in the local set of digits / punctuation marks.
    No stemming or lemmatization is applied.

    Args:
        text: string (a single tweet)

    Returns:
        string: the surviving tokens joined by single spaces
    """
    chars = {'’','1','2','3','4','5','6','7','8','9','0','!',',','_','.','#','...','—','-','@','..'}
    # convert to lower case
    text = text.lower()
    # Dots are escaped and the prefix is \S* so only the link token is removed;
    # an unescaped '.' also matches a space and would swallow the word that
    # precedes a bare "twitter.com/..." link.
    text = re.sub(r"\S*twitter\.com/\S+", "", text)
    text = re.sub(r"http\S+", "", text) #remove urls: http://XXXX
    tokens = nltk.word_tokenize(text)
    # remove stop words and the digit / punctuation tokens
    result = [x for x in tokens if x not in MY_STOP and x not in chars]
    return ' '.join(result)

In [None]:
print(df.tweet[9])
print(preprocess_text(df.tweet[9]))

In [None]:
df['tweet_clean'] = df['tweet'].map(preprocess_text)

In [None]:
df.to_csv('twitter_cleaned.csv')

In [None]:
# join with a space so words of adjacent tweets are not fused together
clean_text = ' '.join(df.tweet_clean)


In [None]:
?str.replace

In [None]:
# read the mask image
lion_mask = np.array(Image.open("black_lion.png"))

wc = WordCloud(background_color="white", mask=lion_mask)
# generate word cloud
wc.generate(clean_text)
# store to file
wc.to_file("lion.png")
# show
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.figure()
plt.imshow(lion_mask, cmap=plt.cm.gray, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
n_rows = df.shape[0]
n_cols = df.shape[1]
print(n_rows)
print(n_cols)

In [None]:
# Find the index label of the first cleaned tweet that still contains
# 'twitter'. Iterating over items() avoids label lookups on range(n_rows),
# and `in` also catches a match at position 0 (find(...) > 0 does not).
# i stays None when no tweet matches.
i = None
for label, tweet in df.tweet_clean.items():
    if 'twitter' in tweet:
        i = label
        break
print(i)

In [None]:
text = df.tweet_clean[73]
print(text)
## Need to remove short link pic.twitter.com/xxxx

In [None]:
text = df.tweet_clean[9]
print(text)
text = re.sub(r"\S+.twitter.com\/\S+", "", text)
print(text)

In [None]:
all_text[:300]

In [None]:
df.hashtags[:20]

In [None]:
### to extract all the hashtags

In [None]:
# flatten the per-tweet hashtag lists into one list (empty lists add nothing)
hashtags_all = [tag for tags in df.hashtags if len(tags) > 0 for tag in tags]

In [None]:
h1 = df.hashtags[19]
h1

In [None]:
h2 = df.hashtags[18]
h2

In [None]:
h = h1 + h2
h

In [None]:
from collections import Counter

In [None]:
c = Counter(hashtags_all)

In [None]:
hashtag_freq = c.most_common(30)

In [None]:
hashtag_freq_name = [x[0] for x in hashtag_freq[::-1]]
hashtag_freq_freq = [x[1] for x in hashtag_freq[::-1]]

In [None]:
plt.style.use('ggplot')
plt.figure(figsize=(15,10))
plt.barh(hashtag_freq_name, hashtag_freq_freq,color='c')
plt.title('hashtag frequeency(top 30)')

In [None]:
# Same chart with the two most frequent hashtags left out.
# hashtag_freq is sorted most-frequent first; reverse it so barh draws the
# most frequent remaining tag at the top.
hashtag_freq_rest = hashtag_freq[2:][::-1]
hashtag_freq_name = [x[0] for x in hashtag_freq_rest]
hashtag_freq_freq = [x[1] for x in hashtag_freq_rest]
plt.style.use('ggplot')
plt.figure(figsize=(15,10))
plt.barh(hashtag_freq_name, hashtag_freq_freq,color='c')
plt.title('hashtag frequency (top 30, excluding the 2 most frequent)')

In [None]:
df.tweet_clean[:90000].to_csv("only_tweets.csv")

In [None]:
df.shape

In [None]:
dates = df.date.unique()

In [None]:
dates

In [None]:
for date in dates:
    print(date)
    df_day = df[df.date == date].tweet
    file_name = "tweet_" + str(date)[:10] + ".csv"
    df_day.reset_index().tweet.to_csv(file_name,index=False)

In [None]:
df_day.reset_index().tweet.to_csv("t.csv",index=False)

## Use the Google NLP API to Get Sentiment Scores

In [None]:
from google.cloud import language_v1
from google.cloud.language_v1 import enums
import six


def sample_analyze_sentiment(content, client=None):
    """Return the document-level sentiment score of `content`.

    Args:
        content: plain text to analyze.
        client: optional language_v1.LanguageServiceClient to reuse. When
            omitted a new client is created for this call, so pass one in
            when scoring many texts.

    Returns:
        response.document_sentiment.score on success, or the sentinel 10
        when the API call fails. The Natural Language API documents score as
        lying in [-1.0, 1.0], so 10 should never collide with a real score;
        callers filter it out with `x != 10`.
        (document_sentiment.magnitude is also available but is not returned.)
    """
    if client is None:
        client = language_v1.LanguageServiceClient()

    document = {'type': enums.Document.Type.PLAIN_TEXT, 'content': content}

    try:
        response = client.analyze_sentiment(document)
        return response.document_sentiment.score
    except Exception:
        # Best effort: one failed request must not abort a long batch.
        # `except Exception` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate so the batch can still be stopped by hand.
        return 10

In [None]:
content = 'Hello, world!'

In [None]:
print(sample_analyze_sentiment(content))

In [None]:
df['sentiment'] = df['tweet_no_url'].map(sample_analyze_sentiment)

In [None]:
def implicit():
    """Print the Cloud Storage buckets reachable with the ambient credentials."""
    from google.cloud import storage

    # No credentials are passed to the constructor, so the client library
    # looks them up in the environment.
    client = storage.Client()

    # list_buckets() performs an authenticated API request
    print(list(client.list_buckets()))

In [None]:
from google.cloud import storage
# Take the key file location from GOOGLE_APPLICATION_CREDENTIALS when it is
# set, so the notebook is not tied to one machine; the original local path
# remains the fallback.
cred_path = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS",
                           "/Users/xajin/Downloads/google_cred.json")
storage_client = storage.Client.from_service_account_json(cred_path)

In [None]:
buckets = list(storage_client.list_buckets())

In [None]:
df.head(3)

In [None]:
df.dtypes

In [None]:
df.tweet_no_url[0]

In [None]:
sample_analyze_sentiment(df.tweet_no_url[0])

In [None]:
df.id[:3]

In [None]:
# Map tweet id -> sentiment score (10 marks a failed API call).
# zip() walks the two columns by position, so gaps in the index labels do not
# matter; n_done is 1-based and drives the progress message.
sentiment_scores = {}
for n_done, (t_id, text) in enumerate(zip(df.id, df.tweet_no_url), start=1):
    sentiment_scores[t_id] = sample_analyze_sentiment(text)
    if n_done % 100000 == 0:
        print("%d texts analyzed" % n_done)

In [None]:
df.shape[0]

In [None]:
len(sentiment_scores)

In [None]:
scores = sentiment_scores.values()

In [None]:
scores = list(scores)

In [None]:
scores = [x for x in scores if x != 10]

In [None]:
sum(scores) / len(scores)

In [None]:
len(scores)

In [None]:
sentiment_scores

In [None]:
print(df[df.id == 1157984313473814528].tweet)
print(sentiment_scores[1157984313473814528])

## Word2Vec for Semantic Analysis

In [None]:
## Word2Vec
import gensim
from gensim.models import Word2Vec

In [None]:
# One token list per cleaned tweet. split() with no argument returns [] for
# an empty string, whereas split(' ') returns [''] and would put an
# empty-string "word" into the training corpus.
train_text_list = [text.split() for text in df.tweet_clean]
    

In [None]:
from gensim.test.utils import common_texts, get_tmpfile

# The corpus is the list of token lists built from df.tweet_clean.
sentences = train_text_list

# Passing the corpus to the constructor builds the vocabulary and runs an
# initial training pass; train() below then continues for 20 more epochs.
model = gensim.models.Word2Vec(
        sentences,
        window=10,
        min_count=10,
        alpha=0.02,
        workers=10)
model.train(sentences, total_examples=len(sentences), epochs=20)

In [None]:
model.wv.most_similar(['thelionking'],topn=20)

In [None]:
my_list = model.wv.most_similar(['lionking'],topn=30)

In [None]:
adjectives = [x for x in my_list if x[0].find('lion') < 0]

In [None]:
hakunamatata_n = model.wv.most_similar(['hakunamatata'],topn=30)