In [1]:
import json
import pandas as pd
import re
from collections import Counter
import os
from tqdm import tqdm

In [2]:
# Load json from data files
#Load Tweets


'''
Loads tweet data from a specific file (which contains multiple JSON tweets) into a panda dataframe
'''
def read_input(input_txt_file):
    tweets_list = []
    #print("Start Reading Text file containing multiple JSON objects -> Return Python dictionary (key-value) of JSON objets")
    with open(input_txt_file) as file:
        for jsonObject in file:
            if not jsonObject.isspace():
                tweets_Dict = json.loads(jsonObject)
                tweets_list.append(tweets_Dict)

    tweet_data_frame = pd.DataFrame.from_dict(tweets_list)
    #filter out retweeted tweets
    tweet_data_frame=tweet_data_frame[tweet_data_frame.retweeted==False]
    # Lowercase --> column TEXT
    tweet_data_frame["text"] = tweet_data_frame["text"].apply(lambda str: str.lower())
   
    return tweet_data_frame

In [3]:
def count_pronouns(tweet_texts):
    hen_patter = re.compile("[^a-z]hen[^a-z]")
    han_patter = re.compile("[^a-z]han[^a-z]")
    hon_patter = re.compile("[^a-z]hon[^a-z]")
    den_patter = re.compile("[^a-z]den[^a-z]")
    det_patter = re.compile("[^a-z]det[^a-z]")
    denna_patter = re.compile("[^a-z]denna[^a-z]")
    denne_patter = re.compile("[^a-z]denne[^a-z]")
    counters = {'hen':0,'han':0,'hon':0,'den':0,'det':0,'denna':0,'denne':0}
    # Loop through all tweets texts and count the pronouns
    for index, line in tweet_texts.items():
         # Hen Patter     
        if hen_patter.search(line):
            counters['hen'] += 1
        # “han” Patter       
        if han_patter.search(line):
            counters['han'] += 1
        # HON        
        if hon_patter.search(line):
             counters['hon'] += 1
               # DET        
        if den_patter.search(line):
            counters['den'] += 1
            # DET        
        if det_patter.search(line):
             counters['det'] += 1
        # DENNA        
        if denna_patter.search(line):
             counters['denna'] += 1
        # DENNE       
        if denne_patter.search(line):
            counters['denne'] += 1
    return counters
    


In [4]:
# %%time
# results = {}
# input_path = "./data"
# for filename in os.listdir(input_path):
#     tweet_data_frame = read_input(os.path.join(input_path, filename))
#     for index, line in tweet_data_frame.items():
#         result_dic = count_pronouns(tweet_data_frame['text'])
#         results = Counter(results) + Counter(result_dic)
#results

In [5]:
#------------------------------------ 

In [6]:
def count_pronouns_for1JSON(one_tweet_text):
    hen_patter = re.compile("[^a-z]hen[^a-z]")
    han_patter = re.compile("[^a-z]han[^a-z]")
    hon_patter = re.compile("[^a-z]hon[^a-z]")
    den_patter = re.compile("[^a-z]den[^a-z]")
    det_patter = re.compile("[^a-z]det[^a-z]")
    denna_patter = re.compile("[^a-z]denna[^a-z]")
    denne_patter = re.compile("[^a-z]denne[^a-z]")
    counters = {'hen':0,'han':0,'hon':0,'den':0,'det':0,'denna':0,'denne':0}
    # Loop through all tweets texts and count the pronouns
       # Hen Patter     
    if hen_patter.search(one_tweet_text):
        counters['hen'] += 1
    # “han” Patter       
    if han_patter.search(one_tweet_text):
        counters['han'] += 1
    # HON        
    if hon_patter.search(one_tweet_text):
         counters['hon'] += 1
           # DET        
    if den_patter.search(one_tweet_text):
        counters['den'] += 1
        # DET        
    if det_patter.search(one_tweet_text):
         counters['det'] += 1
    # DENNA        
    if denna_patter.search(one_tweet_text):
         counters['denna'] += 1
    # DENNE       
    if denne_patter.search(one_tweet_text):
        counters['denne'] += 1
    return counters

In [7]:
%%time
results ={}
counter = 0
# input_path = "./data/05cb5036-2170-401b-947d-68f9191b21c6"
#input_path = 'untitled.txt'
input_path = "./data"
for filename in tqdm(os.listdir(input_path)):
    with open(os.path.join(input_path, filename)) as file:
        for jsonObject in file:
            if not jsonObject.isspace():
                json_tweet = json.loads(jsonObject)  
                if (json_tweet['retweeted'] == False): 
                    text = json_tweet['text'].lower()
                    result = count_pronouns_for1JSON(text)
                    counter = counter +  1
            results = Counter(results) + Counter(result)

print(dict(results))
print(counter)


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 101/101 [01:00<00:00,  1.67it/s]

{'hon': 80306, 'han': 237298, 'det': 180874, 'den': 485848, 'hen': 8012, 'denne': 2600, 'denna': 7980}
452723
CPU times: user 55.6 s, sys: 2.7 s, total: 58.3 s
Wall time: 1min





In [8]:
#--------------------------------CELERY----------------------------------------------------------------------------------------
#First: Queue all Tasks first by counting the pronouns
#Second: 

In [9]:
%%time
# Celery task
from proj.tasks import addDict, mul, count_pronouns_for1JSON
results ={}
queuedTask =[]
#input_path = "./data/05cb5036-2170-401b-947d-68f9191b21c6"
#input_path = 'untitled.txt'
input_path = "./data"
for filename in tqdm(os.listdir(input_path)): #(os.path.join(input_path, filename)
    with open(os.path.join(input_path, filename)) as file:
        for jsonObject in file:
            if not jsonObject.isspace():
                json_tweet = json.loads(jsonObject)  
                if (json_tweet['retweeted'] == False): 
                    text = json_tweet['text'].lower()
                    queuedTask.append(count_pronouns_for1JSON.delay(text))


        
        
queuedTask


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 101/101 [35:15<00:00, 20.95s/it]

CPU times: user 7min 13s, sys: 33.4 s, total: 7min 47s
Wall time: 35min 15s





[<AsyncResult: b1da6640-bf86-4961-aea7-dcf5ff91102e>,
 <AsyncResult: 4a877422-c3ae-4dd5-8571-f9832561664f>,
 <AsyncResult: cc663cbb-9ede-49cf-b2f7-41ad240c4775>,
 <AsyncResult: 392998d9-5803-401d-a8ea-b0be02b431b7>,
 <AsyncResult: 17c02c2f-f5a8-410d-97c7-9c942f2cc36d>,
 <AsyncResult: fdbc1895-f7de-40e9-bbb9-c28250174a87>,
 <AsyncResult: d880e3c0-5096-45a3-a805-a0a766248495>,
 <AsyncResult: 6e818852-22b1-4474-bc3a-7ac55fb414a4>,
 <AsyncResult: 9a4c5b63-cb6c-4481-a49e-e71b6ea7d2f5>,
 <AsyncResult: 2d5d18c2-2a3f-40c1-9e9a-05fceb6339e3>,
 <AsyncResult: 6af7c64b-0ad7-436a-9d8c-21996dad362f>,
 <AsyncResult: 96a46e3a-b964-4dfe-b018-4155ff349a70>,
 <AsyncResult: d232288e-1895-4010-ace9-b796c1d552cb>,
 <AsyncResult: 8980b208-4ce3-4306-bb2f-2302c4269dfb>,
 <AsyncResult: c7fcbd54-4ede-4b93-817c-14bd5e77b6a1>,
 <AsyncResult: f7289639-1b38-49c7-8ceb-d4eeb1bddfb6>,
 <AsyncResult: e3305eed-7af3-468e-90c1-c11f1623b3a9>,
 <AsyncResult: f125745c-2abb-4d6d-baaf-208162b4aa11>,
 <AsyncResult: 90a4e54f-22da

In [10]:
len(queuedTask)

452723

In [None]:
%%time
results = queuedTask[0].get()
for i in range(1,len(queuedTask)):
    results = Counter(results) + Counter(queuedTask[i].get())
    
results
    