In [1]:
import json
import pandas as pd
import re
from collections import Counter
import os

In [2]:
# Load json from data files
#Load Tweets


'''
Loads tweet data from a specific file (which contains multiple JSON tweets) into a panda dataframe
'''
def read_input(input_txt_file):
    tweets_list = []
    #print("Start Reading Text file containing multiple JSON objects -> Return Python dictionary (key-value) of JSON objets")
    with open(input_txt_file) as file:
        for jsonObject in file:
            if not jsonObject.isspace():
                tweets_Dict = json.loads(jsonObject)
                tweets_list.append(tweets_Dict)

    tweet_data_frame = pd.DataFrame.from_dict(tweets_list)
    #filter out retweeted tweets
    tweet_data_frame=tweet_data_frame[tweet_data_frame.retweeted==False]
    # Lowercase --> column TEXT
    tweet_data_frame["text"] = tweet_data_frame["text"].apply(lambda str: str.lower())
   
    return tweet_data_frame

In [3]:
def count_pronouns(tweet_texts):
    hen_patter = re.compile("[^a-z]hen[^a-z]")
    han_patter = re.compile("[^a-z]han[^a-z]")
    hon_patter = re.compile("[^a-z]hon[^a-z]")
    den_patter = re.compile("[^a-z]den[^a-z]")
    det_patter = re.compile("[^a-z]det[^a-z]")
    denna_patter = re.compile("[^a-z]denna[^a-z]")
    denne_patter = re.compile("[^a-z]denne[^a-z]")
    counters = {'hen':0,'han':0,'hon':0,'den':0,'det':0,'denna':0,'denne':0}
    # Loop through all tweets texts and count the pronouns
    for index, line in tweet_texts.items():
         # Hen Patter     
        if hen_patter.search(line):
            counters['hen'] += 1
        # “han” Patter       
        if han_patter.search(line):
            counters['han'] += 1
        # HON        
        if hon_patter.search(line):
             counters['hon'] += 1
               # DET        
        if den_patter.search(line):
            counters['den'] += 1
            # DET        
        if det_patter.search(line):
             counters['det'] += 1
        # DENNA        
        if denna_patter.search(line):
             counters['denna'] += 1
        # DENNE       
        if denne_patter.search(line):
            counters['denne'] += 1
    return counters
    


In [4]:
# %%time
# results = {}
# input_path = "./data"
# for filename in os.listdir(input_path):
#     tweet_data_frame = read_input(os.path.join(input_path, filename))
#     for index, line in tweet_data_frame.items():
#         result_dic = count_pronouns(tweet_data_frame['text'])
#         results = Counter(results) + Counter(result_dic)
#results

In [5]:
#------------------------------------ 

In [6]:
def count_pronouns_for1JSON(one_tweet_text):
    hen_patter = re.compile("[^a-z]hen[^a-z]")
    han_patter = re.compile("[^a-z]han[^a-z]")
    hon_patter = re.compile("[^a-z]hon[^a-z]")
    den_patter = re.compile("[^a-z]den[^a-z]")
    det_patter = re.compile("[^a-z]det[^a-z]")
    denna_patter = re.compile("[^a-z]denna[^a-z]")
    denne_patter = re.compile("[^a-z]denne[^a-z]")
    counters = {'hen':0,'han':0,'hon':0,'den':0,'det':0,'denna':0,'denne':0}
    # Loop through all tweets texts and count the pronouns
       # Hen Patter     
    if hen_patter.search(one_tweet_text):
        counters['hen'] += 1
    # “han” Patter       
    if han_patter.search(one_tweet_text):
        counters['han'] += 1
    # HON        
    if hon_patter.search(one_tweet_text):
         counters['hon'] += 1
           # DET        
    if den_patter.search(one_tweet_text):
        counters['den'] += 1
        # DET        
    if det_patter.search(one_tweet_text):
         counters['det'] += 1
    # DENNA        
    if denna_patter.search(one_tweet_text):
         counters['denna'] += 1
    # DENNE       
    if denne_patter.search(one_tweet_text):
        counters['denne'] += 1
    return counters

In [7]:
%%time
results ={}
counter = 0
# input_path = "./data/05cb5036-2170-401b-947d-68f9191b21c6"
#input_path = 'untitled.txt'
input_path = "./data"
for filename in os.listdir(input_path):
    with open(os.path.join(input_path, filename)) as file:
        for jsonObject in file:
            if not jsonObject.isspace():
                json_tweet = json.loads(jsonObject)  
                if (json_tweet['retweeted'] == False): 
                    text = json_tweet['text'].lower()
                    result = count_pronouns_for1JSON(text)
                    counter = counter +  1
            results = Counter(results) + Counter(result)

print(dict(results))
print(counter)


{'hon': 80306, 'han': 237298, 'det': 180874, 'den': 485848, 'hen': 8012, 'denne': 2600, 'denna': 7980}
452723
CPU times: user 55.3 s, sys: 2.96 s, total: 58.2 s
Wall time: 1min 4s


In [4]:
#--------------------------------CELERY----------------------------------------------------------------------------------------
#First: Queue all Tasks first by counting the pronouns
#Second: 

In [4]:
%%time
# Celery task
from proj.tasks import addDict, mul, count_pronouns_for1JSON
results ={}
queuedTask =[]
input_path = "./data/05cb5036-2170-401b-947d-68f9191b21c6"
#input_path = 'untitled.txt'
#input_path = "./data"
#for filename in os.listdir(input_path): #(os.path.join(input_path, filename)
with open(input_path) as file:
    for jsonObject in file:
        if not jsonObject.isspace():
            json_tweet = json.loads(jsonObject)  
            if (json_tweet['retweeted'] == False): 
                text = json_tweet['text'].lower()
                queuedTask.append(count_pronouns_for1JSON.delay(text))


        
        
queuedTask


CPU times: user 6.9 s, sys: 668 ms, total: 7.57 s
Wall time: 1min 9s


[<AsyncResult: 26d61a6a-7242-4b8c-ac3b-15266bd69f5d>,
 <AsyncResult: 6d9bb421-30de-4de1-a32f-c9aefb6c42b3>,
 <AsyncResult: 627d4046-d3fd-401f-b01e-bde8a60f6985>,
 <AsyncResult: 3c7750c0-a1f4-41a5-b00f-b7cf870d5a57>,
 <AsyncResult: 264ca314-a20e-44f4-9f75-35bf08219bac>,
 <AsyncResult: 74ca24f6-b996-4549-a5e0-721fc96c54cc>,
 <AsyncResult: 0b33f8b0-5b0d-4286-9ee8-6ffd2570c954>,
 <AsyncResult: 0dd862b5-92c2-49f5-9a3f-4e4de753168f>,
 <AsyncResult: 4187b4c6-11be-4201-ad8a-0ca20a70d9e2>,
 <AsyncResult: 78dd5063-d63d-4145-a8e5-2c210de7538a>,
 <AsyncResult: 53fdd8cf-108f-4b34-a2a7-513d79f1f176>,
 <AsyncResult: 80b12aa8-384d-46b3-bd8d-b20b8859038a>,
 <AsyncResult: b7e5f078-f080-4fbe-88e2-e5f8330cb80d>,
 <AsyncResult: 7966f4a8-21c3-4ceb-ad09-2372fe340231>,
 <AsyncResult: 7fd9c255-76db-44ba-bf72-e2a9484a4a4f>,
 <AsyncResult: 06d0edc6-7d08-447d-b29b-93fb47d70d6c>,
 <AsyncResult: 5f305caf-b521-40d7-abe2-4bdf3b02c71e>,
 <AsyncResult: 5abbc18c-3f9b-4963-abec-fd050dd10e2f>,
 <AsyncResult: c0d3d4b1-fd04

In [None]:
len(queuedTask)

In [5]:
%%time
results = queuedTask[0].get()
for i in range(1,len(queuedTask)):
    results = Counter(results) + Counter(queuedTask[i].get())
    
results
    

CPU times: user 2.19 s, sys: 417 ms, total: 2.61 s
Wall time: 22.8 s


Counter({'hen': 64,
         'den': 3429,
         'han': 2076,
         'det': 1292,
         'hon': 437,
         'denne': 27,
         'denna': 49})

In [None]:
def add(x, y):
    return x + add()

In [26]:
from celery import group
#res = group(queuedTask[i].get() for i in range(0,len(queuedTask)))
res = group(queuedTask).get()

TypeError: get expected at least 1 argument, got 0