In [39]:
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer 
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\amase\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\amase\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\amase\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


In [44]:
annotations_json = pd.read_json('./dataset/tweets_raw.json')

In [45]:
annotations_json

Unnamed: 0,Corpus,Tweets
0,ALM,"[{'tweet_id': '521033092132503552', 'tweet_tex..."
1,Baltimore,"[{'tweet_id': '593899776564944897', 'tweet_tex..."
2,BLM,"[{'tweet_id': '734202176684298240', 'tweet_tex..."
3,Davidson,"[{'tweet_id': '5', 'annotations': [{'annotator..."
4,Election,"[{'tweet_id': '509464992404357120', 'tweet_tex..."
5,MeToo,"[{'tweet_id': '48430014437122048', 'tweet_text..."
6,Sandy,"[{'tweet_id': '258018822945120256', 'tweet_tex..."


In [46]:
#Validity Checker

sum = 0 
total_tweet = 0
v = []
moral_category = {}

for corpus_i in range(len(annotations_json['Corpus'])):
    valid = 0 
    for tweet in annotations_json['Tweets'][corpus_i]:
        total_tweet += 1
        if 'tweet_text' in tweet:
            sum += 1 
            if tweet['tweet_text'] != 'no tweet text available':
                valid += 1
        for annotation in tweet['annotations']:
            annotations = annotation['annotation'].split(',')
            for i in annotations:
                if i not in moral_category:
                    moral_category[i] = 0
                else:
                    moral_category[i] += 1
    v.append(valid)
                
print("total tweet:",total_tweet,"\ntotal tweet text:",sum,"\nvalid tweet text",v)
            

#Setting up global variable
moral_OHL = {}
moral_OHL['care'] = 0
moral_OHL['purity'] = 1
moral_OHL['subversion'] = 2 
moral_OHL['loyalty'] = 3 
moral_OHL['harm'] = 4
moral_OHL['cheating'] = 5 
moral_OHL['fairness'] = 6 
moral_OHL['non-moral'] = 7 
moral_OHL['betrayal'] = 8 
moral_OHL['authority'] = 9 
moral_OHL['degradation'] =10

total tweet: 34987 
total tweet text: 30114 
valid tweet text [3243, 3942, 3960, 0, 4147, 0, 3673]


In [180]:
# get tweets and annotations for each individual annotator

# DATA-SET1 Individual corpus with individual annotator 
# DATA-SET2 Individual corpus with mix annotator 
# DATA-SET3 Mix corpus with mix annotator 
# DATA-SET4 Mix corpus with Individual annotator 

import re

annotators_tweets = {}

for corpus_i in range(len(annotations_json['Corpus'])):
    for tweet in annotations_json['Tweets'][corpus_i]:
        for annotation in tweet['annotations']:
            
            # make sure annotator exists in the dictionary
            if annotation['annotator'] not in annotators_tweets:
                annotators_tweets[annotation['annotator']] = []
                
            if 'tweet_text' in tweet:
                if tweet['tweet_text'] != 'no tweet text available':
                    
                    #Step 1 Basic Clean up
                    text = re.sub(r'http\S+', '', tweet['tweet_text'])
                    
                    #Getting rid off @
                    text = re.sub(r'@\S+', '', text)
                    
                    #Getting rid off hashtag
                    text = re.sub(r'#\S+', '', text)
                    
                    #Getting rid off &
                    text = re.sub(r'&amp', '', text)
            
                    #Step 2 - Lowercasing 
                    text = text.lower()
                    
                    #Step 3 - Tokenization
                    text = word_tokenize(text)

                    #Step 4 - Stopwords removal 
                    stop_words = set(stopwords.words('english')) 
                    text = [i for i in text if not i in stop_words]
                    
                    #Step 5 - Stemming
                    newtext = []
                    for i in text:
                        newtext.append(ps.stem(i))
                        
                    #Step 6 - lemmatization
                    new = []
                    for i in newtext:
                        new.append(lemmatizer.lemmatize(i))
                        
                    text = " ".join(new)
  
                    #Setting up onehot for annotations. 
                    annotations = annotation['annotation'].split(',')
                    
                    moral_labels = [0 for i in range(11)]
                    
                    for i in annotations:
                        if i in moral_OHL:
                            moral_labels[moral_OHL[i]] = 1
            
                    # arrange data format of each tweet 
                    new_tweet = { 'tweet': text,
                                 'moral labels': moral_labels,
                                 'corpus': annotations_json['Corpus'][corpus_i],
                                'tweet_id': tweet['tweet_id'],
                                'annotations': annotations, 
                                }
                    annotators_tweets[annotation['annotator']].append(new_tweet)

In [178]:
# visualize annotator01's first 5 tweets with annotations
annotators_tweets['annotator02'][:10]

[{'tweet': 'wholeheartedli support protest ; act civil disobedi ; join !',
  'moral labels': [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
  'corpus': 'ALM',
  'tweet_id': '537681598989475841',
  'annotations': ['loyalty']},
 {'tweet': 'sandra bland situat man disrespect rest soul , peopl die everyday unjustifi matter',
  'moral labels': [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
  'corpus': 'ALM',
  'tweet_id': '624644420705648640',
  'annotations': ['cheating']},
 {'tweet': 'commit peac , heal love neighbor . give u strength patienc .',
  'moral labels': [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  'corpus': 'ALM',
  'tweet_id': '752979765984890884',
  'annotations': ['care', 'purity']},
 {'tweet': 'injustic one injustic',
  'moral labels': [1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0],
  'corpus': 'ALM',
  'tweet_id': '548029362348765185',
  'annotations': ['care', 'loyalty', 'purity']},
 {'tweet': 'compass look like !',
  'moral labels': [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  'corpus': 'ALM',
  'tweet_id': '75326122473789

In [179]:
#13 annotators 

data_tweets = {}

for annotator in annotators_tweets:
    if len(annotators_tweets[annotator]) > 500:
        data_tweets[annotator] = annotators_tweets[annotator]
        print(annotator, ', annotated amount:', len(data_tweets[annotator]))

annotator01 , annotated amount: 3106
annotator02 , annotated amount: 3213
annotator03 , annotated amount: 3234


In [14]:
# DATA-SET1  Individual annotator with Individual corpus with
# DATA-SET2  Individual annotator with mixed corpus 
# DATA-SET3  Mixed annotator with individual corpus  
# DATA-SET4  Mixed annotator with mixed corpus 
import json

In [19]:
# DATA-SET1 Individual annotator with Individual corpus with

data_set1 = {}

for annotator in data_tweets:
    if annotator not in data_set1:
        data_set1[annotator] = {}
    for i in data_tweets[annotator]:
        if i['corpus'] not in data_set1[annotator]:
            data_set1[annotator][i['corpus']] = [[],[]]
        else:
            data_set1[annotator][i['corpus']][0].append(i['tweet'])
            data_set1[annotator][i['corpus']][1].append(i['moral labels'])
                

print(data_set1['annotator02']['ALM'][0][:5])
print(data_set1['annotator02']['ALM'][1][:5])

with open('./dataset/iaic.json', 'w') as outfile:
    json.dump(data_set1, outfile)

['This Sandra Bland situation man no disrespect rest her soul , but people die everyday in a unjustified matter #AllLivesMatter', 'Commitment to peace, healing and loving neighbors. Give us strength and patience. #PortlandPride #AllLivesMatter #Peace', 'Injustice for one is an injustice for all #AllLivesMatter  #AntonioMartin', 'This is what compassion looks like! #vegan #AllLivesMatter ', 'Liberty and Justice for all? How about opportunity for all. #blacklivesmatter #alllivesmatter']
[[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0], [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]]


In [20]:
# DATA-SET2 Individual annotator with mixed corpus 

data_set2 = {}

for annotator in data_tweets:
    if annotator not in data_set2:
        data_set2[annotator] = [[],[]]
    for i in data_tweets[annotator]:
        data_set2[annotator][0].append(i['tweet'])
        data_set2[annotator][1].append(i['moral labels'])
                

print(data_set2['annotator02'][0][:5])
print(data_set2['annotator02'][1][:5])

with open('./dataset/iamc.json', 'w') as outfile:
    json.dump(data_set2, outfile)

['Wholeheartedly support these protests ; acts of civil disobedience ; will join when I can! #Ferguson #AllLivesMatter ', 'This Sandra Bland situation man no disrespect rest her soul , but people die everyday in a unjustified matter #AllLivesMatter', 'Commitment to peace, healing and loving neighbors. Give us strength and patience. #PortlandPride #AllLivesMatter #Peace', 'Injustice for one is an injustice for all #AllLivesMatter  #AntonioMartin', 'This is what compassion looks like! #vegan #AllLivesMatter ']
[[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0], [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]]


In [21]:
# DATA-SET3  Mixed annotator with individual corpus  

data_set3 = {}

for annotator in data_tweets:
    for i in data_tweets[annotator]:
        if i['corpus'] not in data_set3:
            data_set3[i['corpus']] = [[],[]]
        else:
            data_set3[i['corpus']][0].append(i['tweet'])
            data_set3[i['corpus']][1].append(i['moral labels'])
                
print(data_set3['ALM'][0][:5])
print(data_set3['ALM'][1][:5])

with open('./dataset/maic.json', 'w') as outfile:
    json.dump(data_set3, outfile)

['This Sandra Bland situation man no disrespect rest her soul , but people die everyday in a unjustified matter #AllLivesMatter', 'Commitment to peace, healing and loving neighbors. Give us strength and patience. #PortlandPride #AllLivesMatter #Peace', 'Injustice for one is an injustice for all #AllLivesMatter  #AntonioMartin', 'This is what compassion looks like! #vegan #AllLivesMatter ', 'Black Twitter when they see someone tweet #AllLivesMatter ']
[[0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]]


In [22]:
# DATA-SET4  Mixed corpus with mixed annotator 

data_set4 = [[],[]]

for annotator in data_tweets:
    for i in data_tweets[annotator]:
        data_set4[0].append(i['tweet'])
        data_set4[1].append(i['moral labels'])
                
print(data_set4[0][:5])
print(data_set4[1][:5])

with open('./dataset/mamc.json', 'w') as outfile:
    json.dump(data_set4, outfile)

['Wholeheartedly support these protests ; acts of civil disobedience ; will join when I can! #Ferguson #AllLivesMatter ', 'This Sandra Bland situation man no disrespect rest her soul , but people die everyday in a unjustified matter #AllLivesMatter', 'Commitment to peace, healing and loving neighbors. Give us strength and patience. #PortlandPride #AllLivesMatter #Peace', 'Injustice for one is an injustice for all #AllLivesMatter  #AntonioMartin', 'This is what compassion looks like! #vegan #AllLivesMatter ']
[[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]


In [43]:
a = ['1','2','3']
b = '  '.join(a)
b

'1  2  3'

In [165]:
text = "running towards a tree #hunt #worldwide #alllivesmatter "

#Getting rid off @
text = re.sub(r'@\S+', '', text)

#Getting rid off hashtag
text = re.sub(r'#\S+', '', text)

#Getting rid off &
text = re.sub(r'&amp', '', text)

#Getting rid off emoji, new lines
text = re.sub(r'\\\\u....', '', text)

#Step 2 - Lowercasing 
text = text.lower()

#Step 3 - Tokenization
text = word_tokenize(text)

#Step 3 - Stopwords removal 

stop_words = set(stopwords.words('english')) 
text = [i for i in text if not i in stop_words]

 
#Step 5 - Stemming
newtext = []
for i in text:
    newtext.append(ps.stem(i))

#Step 6 - lemmatization
new = []
for i in newtext:
    new.append(lemmatizer.lemmatize(i))
    
    
new = " ".join(new)

new

'run toward tree'