# Text Analysis on gurlitt data

In [1]:
import numpy as np
import pandas as pd
import datetime 
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv(r'D:\論文\PHEME9\Data\CSV\gurlitt.csv', encoding = 'utf-8', header = None)
df.columns = ['src_tweet_id', 'src_user_id', 'src_tweet','src_date','reply_tweet_id','reply_user_id','reply_tweet','reply_date', 'label']
df.reply_tweet = df.reply_tweet.replace(np.nan,"")

In [3]:
df.columns

Index(['src_tweet_id', 'src_user_id', 'src_tweet', 'src_date',
       'reply_tweet_id', 'reply_user_id', 'reply_tweet', 'reply_date',
       'label'],
      dtype='object')

In [4]:
df

Unnamed: 0,src_tweet_id,src_user_id,src_tweet,src_date,reply_tweet_id,reply_user_id,reply_tweet,reply_date,label
0,535386605666979840,43092938,Will Bern's Museum of Fine Arts accept the con...,Thu Nov 20 10:58:08 +0000 2014,,,,,0
1,535391478969675776,38402632,The Gurlitt art collection no one - and everyo...,Thu Nov 20 11:17:30 +0000 2014,,,,,0
2,535415010361487360,719898644,German and Swiss handling of Gurlitt hoard and...,Thu Nov 20 12:51:00 +0000 2014,,,,,0
3,535415869862473729,1140049158,The Gurlitt art collection no one - and everyo...,Thu Nov 20 12:54:25 +0000 2014,,,,,0
4,535425236871544832,289989248,The Gurlitt art collection no one - and everyo...,Thu Nov 20 13:31:38 +0000 2014,,,,,0
...,...,...,...,...,...,...,...,...,...
159,536848134681690112,22138756,Summary of the #Gurlitt agreement between Kuns...,Mon Nov 24 11:45:44 +0000 2014,,,,,1
160,536848139836481536,16541374,Gurlitt collection accepted by Kunstmuseum Ber...,Mon Nov 24 11:45:45 +0000 2014,,,,,1
161,536848241556721665,553661747,Swiss museum accepts looted Nazi art http://t....,Mon Nov 24 11:46:09 +0000 2014,5.368550e+17,2.354298e+09,@WhyNowProject Mazel Tov! You are now featured...,Mon Nov 24 12:13:03 +0000 2014,1
162,536848662135980032,59608915,Gurlitt Collection Accepted by Kunstmuseum Ber...,Mon Nov 24 11:47:49 +0000 2014,,,,,1


In [5]:
src_tw_df = df.iloc[:,[0,1,2,3,8]].copy()
src_tw_df = src_tw_df.drop_duplicates()
src_tw_df.reset_index(drop=True,inplace=True)
src_tw_df

Unnamed: 0,src_tweet_id,src_user_id,src_tweet,src_date,label
0,535386605666979840,43092938,Will Bern's Museum of Fine Arts accept the con...,Thu Nov 20 10:58:08 +0000 2014,0
1,535391478969675776,38402632,The Gurlitt art collection no one - and everyo...,Thu Nov 20 11:17:30 +0000 2014,0
2,535415010361487360,719898644,German and Swiss handling of Gurlitt hoard and...,Thu Nov 20 12:51:00 +0000 2014,0
3,535415869862473729,1140049158,The Gurlitt art collection no one - and everyo...,Thu Nov 20 12:54:25 +0000 2014,0
4,535425236871544832,289989248,The Gurlitt art collection no one - and everyo...,Thu Nov 20 13:31:38 +0000 2014,0
...,...,...,...,...,...
133,536848134681690112,22138756,Summary of the #Gurlitt agreement between Kuns...,Mon Nov 24 11:45:44 +0000 2014,1
134,536848139836481536,16541374,Gurlitt collection accepted by Kunstmuseum Ber...,Mon Nov 24 11:45:45 +0000 2014,1
135,536848241556721665,553661747,Swiss museum accepts looted Nazi art http://t....,Mon Nov 24 11:46:09 +0000 2014,1
136,536848662135980032,59608915,Gurlitt Collection Accepted by Kunstmuseum Ber...,Mon Nov 24 11:47:49 +0000 2014,1


## Prepocessing
- Define function to process text at one time

In [6]:
import pickle

In [7]:
'''Convert Emojis to Words'''

with open(r'D:\論文\PHEME9\Emoji_Dict.p', 'rb') as fp:
    Emoji_Dict = pickle.load(fp)
Emoji_Dict = {v: k for k, v in Emoji_Dict.items()}

def convert_emojis_to_word(text):
    import re
    for emot in Emoji_Dict:
        text = re.sub(r'('+emot+')', "_".join(Emoji_Dict[emot].replace(",","").replace(":","").split()), text)
    return text

In [8]:
def textprocessing(text):
 
    if 'RT' in text.split():      #cut retweet
        rt = text.find('RT')
        text = text[0:rt]  
        
    quo = text.find('“')           #cut quoting part
    quo_end = text.find('”')
    at = text.find('@')
    if quo != -1:                  #if find “
        if quo_end != -1:             # and if there is also ”
            text = text[0: quo:] + text[quo_end+1: :]
        else:
            text = text[0: quo:]
    elif quo_end != -1:
        text = text[0:at:] + text[quo_end+1: : ]
            
   
    text = convert_emojis_to_word(text)   #convert emoji to word
    text = text.lower()           #lower case    
     
    
    text = " ".join([word for word in text.split() if 'http' not in word    #remove url
                                and not word.startswith(('\'@','@','.@'))
                                and word != 'rt'])   
                 
    import re
    def remove_punct(tweet):
        new_words = []
        for word in tweet:
            w = re.sub(r'[^\w\s]','',word) #remove everything except words and space
            w = re.sub(r'_','',w) #remove underscore as well
            w = re.sub('[0-9]+','',w)
            new_words.append(w)

        return new_words
    text = "".join(remove_punct(text))

    from nltk.tokenize import TweetTokenizer
    tknzr = TweetTokenizer(strip_handles=True)
    text = tknzr.tokenize(text)
    
    from nltk.corpus import stopwords
    stop_words=set(stopwords.words("english"))
    stop_words=set([a for a in list(stop_words) if re.search("n't|no|not",a) ==None])
    stop_words.add('now')
    stop_words.add('amp')
    def filterstopwords(tw):
        filter_stopwords = []
        for w in tw:
            if w not in stop_words:
                filter_stopwords.append(w)
        return filter_stopwords
    text = filterstopwords(text)
    
    
    from nltk.stem import WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()
    text = [lemmatizer.lemmatize(i,'v') for i in text]
    
#     from nltk.stem.porter import PorterStemmer
#     stem = PorterStemmer()
#     text = [stem.stem(i) for i in text]
    
    text = " ".join(text)
    
    return text

In [21]:
print("Source tweet: ", df.src_tweet.iloc[66])
print("original text: ", df.reply_tweet.iloc[66])
print("cleaned text: ", textprocessing(df.reply_tweet.iloc[66]))

Source tweet:  #Gurlitt's overlooked heirs want control of his Nazi-era 'degenerate art' estate http://t.co/tmjMjzr5Zr http://t.co/jk0ij9ZmDy
original text:  Essential read for those still prepared to get through Gurlitt looted art case “@dw_culture http://t.co/7jhjICVGWr http://t.co/FDZ3wlsVWT”
cleaned text:  essential read still prepare get gurlitt loot art case


In [10]:
quotweet = df[df.reply_tweet.str.find('“') != -1].index
quo_tweet = df[df.reply_tweet.str.find('”') != -1].index
rtt = df[df.reply_tweet.str.find('RT') != -1].index
at =  df[df.reply_tweet.str.find('@') != -1].index

set(quo_tweet).difference(set(rtt)).difference(set(quotweet)).difference(set(at))


set()

In [18]:
df[df.reply_tweet.str.find('”') != -1].index

Int64Index([66, 114, 163], dtype='int64')

### applied on original data (**df**)

In [22]:
src_tw_df['cleaned_src_tw'] = src_tw_df.src_tweet.apply(textprocessing)
df['cleaned_reply_tw'] = df.reply_tweet.apply(textprocessing)

In [23]:
df.head()

Unnamed: 0,src_tweet_id,src_user_id,src_tweet,src_date,reply_tweet_id,reply_user_id,reply_tweet,reply_date,label,cleaned_reply_tw
0,535386605666979840,43092938,Will Bern's Museum of Fine Arts accept the con...,Thu Nov 20 10:58:08 +0000 2014,,,,,0,
1,535391478969675776,38402632,The Gurlitt art collection no one - and everyo...,Thu Nov 20 11:17:30 +0000 2014,,,,,0,
2,535415010361487360,719898644,German and Swiss handling of Gurlitt hoard and...,Thu Nov 20 12:51:00 +0000 2014,,,,,0,
3,535415869862473729,1140049158,The Gurlitt art collection no one - and everyo...,Thu Nov 20 12:54:25 +0000 2014,,,,,0,
4,535425236871544832,289989248,The Gurlitt art collection no one - and everyo...,Thu Nov 20 13:31:38 +0000 2014,,,,,0,


In [24]:
src_tw_df.head()

Unnamed: 0,src_tweet_id,src_user_id,src_tweet,src_date,label,cleaned_src_tw
0,535386605666979840,43092938,Will Bern's Museum of Fine Arts accept the con...,Thu Nov 20 10:58:08 +0000 2014,0,berns museum fine arts accept controversial gu...
1,535391478969675776,38402632,The Gurlitt art collection no one - and everyo...,Thu Nov 20 11:17:30 +0000 2014,0,gurlitt art collection no one everyone want sw...
2,535415010361487360,719898644,German and Swiss handling of Gurlitt hoard and...,Thu Nov 20 12:51:00 +0000 2014,0,german swiss handle gurlitt hoard bequest wron...
3,535415869862473729,1140049158,The Gurlitt art collection no one - and everyo...,Thu Nov 20 12:54:25 +0000 2014,0,gurlitt art collection no one everyone want sw...
4,535425236871544832,289989248,The Gurlitt art collection no one - and everyo...,Thu Nov 20 13:31:38 +0000 2014,0,gurlitt art collection no one everyone want sw...


In [None]:
src_tw_df.cleaned_src_tw.iloc[158]

## Sentiment Analysis

In [25]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [26]:
def get_sentiment(tweet):
    analyzer = SentimentIntensityAnalyzer()
    vs = analyzer.polarity_scores(tweet)
    if vs['compound'] >=0.05:
        sentiment_label = 'Positive'
    elif (vs['compound'] > -0.05) & (vs['compound'] < 0.05):
        sentiment_label = 'Neutral'
    elif vs['compound']<= -0.05:
        sentiment_label = 'Negative' 
    result = sentiment_label
    return result

### applied on src tweet & reply tweet

In [27]:
src_tw_df['src_sentiment']= src_tw_df.cleaned_src_tw.apply(get_sentiment)
df['reply_sentiment']= df.cleaned_reply_tw.apply(get_sentiment)

In [28]:
df.head(1)

Unnamed: 0,src_tweet_id,src_user_id,src_tweet,src_date,reply_tweet_id,reply_user_id,reply_tweet,reply_date,label,cleaned_reply_tw,reply_sentiment
0,535386605666979840,43092938,Will Bern's Museum of Fine Arts accept the con...,Thu Nov 20 10:58:08 +0000 2014,,,,,0,,Neutral


In [29]:
src_tw_df.head(1)

Unnamed: 0,src_tweet_id,src_user_id,src_tweet,src_date,label,cleaned_src_tw,src_sentiment
0,535386605666979840,43092938,Will Bern's Museum of Fine Arts accept the con...,Thu Nov 20 10:58:08 +0000 2014,0,berns museum fine arts accept controversial gu...,Negative


In [30]:
df.to_csv(r'D:\論文\PHEME9\Data\CSV\gurlitt-df.csv',index=False)

In [31]:
src_tw_df.to_csv(r'D:\論文\PHEME9\Data\CSV\gurlitt-src.csv',index=False)