In [1]:
%reload_ext cypher
import matplotlib
import pandas as pd
import matplotlib.pyplot
import datetime
import string
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk import ngrams
import operator
from itertools import islice
from tabulate import tabulate
# Never truncate cell text when a DataFrame is displayed. `None` is the supported
# "no limit" value; the legacy -1 sentinel was deprecated in pandas 1.0 and is
# rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)
import re

In [2]:
def clean_tweets(tweets):
    """Remove URLs, @mentions and #hashtags from each tweet text.

    Parameters
    ----------
    tweets : iterable of str
        Raw tweet texts. Entries that are not strings (e.g. ``None``/``NaN``)
        are skipped, so the result can be shorter than the input.

    Returns
    -------
    list of str
        The cleaned texts, in input order. An empty input yields ``[]``.
    """
    # Characters allowed after the "http://", "@" or "#" prefix. Raw strings keep
    # the regex identical while avoiding invalid-escape warnings for `\(`.
    token_chars = r'(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    patterns = (
        re.compile(r'http[s]?://' + token_chars),  # URLs first: they may contain '@' or '#'
        re.compile('@' + token_chars),             # user mentions
        re.compile('#' + token_chars),             # hashtags
    )

    # Each tweet is cleaned on its own. Joining everything on a sentinel separator
    # and splitting afterwards breaks when a tweet contains the sentinel and
    # returns [''] for an empty input.
    cleaned = []
    for tweet in tweets:
        if not isinstance(tweet, str):
            continue
        for pattern in patterns:
            tweet = pattern.sub('', tweet)
        cleaned.append(tweet)
    return cleaned

In [3]:
# Extra tokens that are filtered out together with NLTK's English stop words.
new_set = {'000', 'de', 'rt', 'http', 'https', 'amp', '1', '25', 'pm', '2', '”', '—'}
stop_words_ = set(stopwords.words('english')) | new_set

In [4]:
def getNGram(text, n, stop_words, m):
    """Count the n-grams of ``text`` after punctuation and stop-word removal.

    Parameters
    ----------
    text : str
        Text to analyse. Punctuation (``string.punctuation`` plus '…' and '¿')
        is replaced by spaces, then the text is lower-cased and split on
        whitespace.
    n : int
        Size of the n-grams (1 = unigrams, 2 = bigrams, ...). Expected to be
        positive; a value below 1 produces no n-grams.
    stop_words : collection of str or None
        Lower-case tokens dropped before the n-grams are built. ``None`` or an
        empty collection disables the filtering.
    m : int
        If ``m >= 1`` only the ``m`` most frequent n-grams are returned;
        any other value returns all of them.

    Returns
    -------
    list of tuple
        Sorted by descending count; ties keep first-seen order.
        For ``m >= 1`` each item is ``(ngram, count, percent)``, where
        ``percent`` is ``count * 100 / number_of_tokens_kept`` (the token
        count after filtering, not the number of n-grams).
        Otherwise each item is ``(ngram, count)``.
    """
    # Turn every punctuation mark into a space in one pass so it separates
    # tokens instead of gluing them together.
    punct_signs = string.punctuation + '…¿'
    text = text.translate(str.maketrans(punct_signs, ' ' * len(punct_signs)))

    # Honour the caller-supplied stop words rather than a notebook global.
    stop_set = set(stop_words) if stop_words else set()
    clean_text = [w for w in text.lower().split() if w not in stop_set]
    total = len(clean_text)

    # Sliding window of width n over the token list (no padding).
    h_dict = {}
    windows = zip(*(islice(clean_text, start, None) for start in range(n)))
    for grams in windows:
        words = ' '.join(grams)
        h_dict[words] = h_dict.get(words, 0) + 1

    sorted_dict = sorted(h_dict.items(), key=operator.itemgetter(1), reverse=True)
    if m >= 1:
        # `total` cannot be zero here unless sorted_dict is empty, in which
        # case the comprehension never divides.
        return [(words, count, count * 100 / total) for words, count in sorted_dict[:m]]
    return sorted_dict

In [6]:
# Ecosystem codes: each one is matched against the `eco` property of tweet
# nodes in the queries below and is used to name the exported CSV files.
ecos = [
    'BLM',
    'MT',
    'CCH',
    'GC',
]

In [7]:
all_tweets_text = []
for eco in ecos:
    tweetsQ = %cypher match  (n:tweet)<-[r :TWEETS]-(n2:user) where n.eco = '{eco}'  return n.tid as tid, substring(n.text, 0, 10000000) as text, n.created_at as date
    tweets = tweetsQ.get_dataframe().text.unique()
    print(len(tweets), len(tweetsQ.get_dataframe().text))
    text = cleanTweets(tweets, eco)
    all_tweets_text.append(text)

61132 rows affected.
26761 61132
596858 rows affected.
219013 596858
40616 rows affected.
20478 40616
527675 rows affected.
130193 527675


## Unigrams and bigrams in tweets by ecosystem

In [9]:
# One edge list per ecosystem: every unigram plus the 100 most frequent bigrams,
# each pointing at the lower-cased ecosystem code and weighted by its count.
df_ngrams = []
for idx, eco in enumerate(ecos):
    print()
    corpus = all_tweets_text[idx]
    final = getNGram(corpus, 1, stop_words_, -1) + getNGram(corpus, 2, stop_words_, 100)
    # Items carry 2 or 3 fields depending on the getNGram call; only the n-gram
    # and its count are kept.
    final = [(row[0], eco.lower(), row[1]) for row in final]
    df = pd.DataFrame(final, columns=['Source', 'Target', 'Weight'])
    df_ngrams.append(df)
    print(eco, len(df))
    df.to_csv(f"ngrams/{eco}_edges_ngrams.csv", index=False)


BLM 33863

MT 220872

CCH 28672

GC 61330


In [10]:
# Stack the per-ecosystem edge lists into a single DataFrame. The name is rebound
# from a list to a DataFrame, so the guard keeps the cell re-runnable: calling
# pd.concat on the already-concatenated DataFrame would raise TypeError.
if isinstance(df_ngrams, list):
    df_ngrams = pd.concat(df_ngrams)

In [11]:
# Average edge weight (n-gram count) for each ecosystem.
for eco in ecos:
    eco_rows = df_ngrams['Target'] == eco.lower()
    print(eco, df_ngrams.loc[eco_rows, 'Weight'].mean())

BLM 10.55987360836311
MT 13.565096526494983
CCH 9.97802734375
GC 28.09254850807109


## Filtering Tweets in English

In [12]:
tweets_text = []
for eco in ecos:
    tweetsQ = %cypher match  (n:tweet)<-[r :TWEETS]-(n2:user) where n.eco = '{eco}' and n.lang = 'en' return n.tid as tid, substring(n.text, 0, 10000000) as text, n.created_at as date
    tweets = tweetsQ.get_dataframe().text.unique()
    print(len(tweets), len(tweetsQ.get_dataframe().text))
    text = cleanTweets(tweets, eco)
    tweets_text.append(text)

52663 rows affected.
22149 52663
363557 rows affected.
144056 363557
37086 rows affected.
18771 37086
491128 rows affected.
112213 491128


In [13]:
# English-only edge lists: every unigram plus the 100 most frequent bigrams per
# ecosystem, each pointing at the lower-cased ecosystem code and weighted by count.
df_ngrams = []
for idx, eco in enumerate(ecos):
    print()
    corpus = tweets_text[idx]
    final = getNGram(corpus, 1, stop_words_, -1) + getNGram(corpus, 2, stop_words_, 100)
    # Items carry 2 or 3 fields depending on the getNGram call; only the n-gram
    # and its count are kept.
    final = [(row[0], eco.lower(), row[1]) for row in final]
    df = pd.DataFrame(final, columns=['Source', 'Target', 'Weight'])
    df_ngrams.append(df)
    print(eco, len(df))
    df.to_csv(f"ngrams/{eco}_en_edges_ngrams.csv", index=False)


BLM 29434

MT 84101

CCH 24139

GC 54621


In [14]:
# Count the tweet nodes in the graph, grouped by their `eco` property.
%cypher match  (n:tweet) return n.eco, count(n)  as  Tweets 

4 rows affected.


n.eco,Tweets
CCH,40616
GC,527641
MT,596798
BLM,60960
