In [1]:
import emoji 
import json
import datetime
import re
from collections import Counter
import timeit
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import ijson

# Path of the tweet corpus file that the loading cell opens and parses as JSON.
file_path = "data_tweets_with_emoji_json.txt"

In [2]:
# Load the tweet corpus: every non-empty line of the file is parsed as one JSON
# document, and the parsed documents are collected in the list `data`.
print("Open corpus file...")
start_time = datetime.datetime.now()

# The tweets contain emoji, so decode explicitly as UTF-8 instead of relying on
# the platform's default encoding.
with open(file_path, "r", encoding="utf-8") as file:
    print("Strip corpus...")
    stripped_lines = (line.strip() for line in file)
    print("Load file as json...")
    # Parse line by line: this avoids building one giant comma-joined string of
    # the whole file, and blank lines are skipped instead of breaking the parse.
    data = [json.loads(line) for line in stripped_lines if line]

# Report the wall-clock duration of the load.
end_time = datetime.datetime.now()
print("Dauer: ", end_time - start_time)

Open corpus file...
Strip corpus...
Load file as json...
Dauer:  0:00:38.006944


In [46]:
# Build a frame from the parsed tweets, keeping only the tweet id, the tweet
# text and the id of the user being replied to.
df = pd.DataFrame(
    data=data,
    columns=[
        'id_str',
        'text',
        'in_reply_to_user_id_str',
    ],
)
# Print a summary of the frame (index range, non-null counts, dtypes, memory).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 392017 entries, 0 to 392016
Data columns (total 3 columns):
id_str                     392017 non-null object
text                       392017 non-null object
in_reply_to_user_id_str    10398 non-null object
dtypes: object(3)
memory usage: 9.0+ MB


In [5]:
# Frame without any row that contains a missing value.
non_nan_df = df.dropna(axis=0, how='any')

# True only if dropping removed nothing, i.e. the frame had no missing values.
non_nan_df.equals(df)

False

In [45]:
# Only tweets without a reply id.
# The reply column is tested explicitly instead of "any column is null": the
# selection then stays correct when df carries further nullable columns, and it
# avoids passing the axis positionally to DataFrame.any(), which newer pandas
# releases no longer accept.
no_reply_tweets = df[df['in_reply_to_user_id_str'].isnull()]
no_reply_tweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 381619 entries, 0 to 392016
Data columns (total 3 columns):
id_str                     381619 non-null object
text                       381619 non-null object
in_reply_to_user_id_str    0 non-null object
dtypes: object(3)
memory usage: 11.6+ MB


In [14]:
# For every tweet keep only the characters that are emoji and join them into
# one string, so `emojis` holds one emoji string per tweet, in frame order.
# Iterating the column directly replaces the manual counter with its per-row
# .loc lookup and no longer assumes that len(data) matches the frame's index.
# NOTE(review): this relies on emoji.UNICODE_EMOJI being a flat collection of
# emoji characters — confirm against the installed version of the emoji package.
emojis = [
    ''.join(c for c in text if c in emoji.UNICODE_EMOJI)
    for text in df["text"]
]

# The most frequent per-tweet emoji strings and their counts.
Counter(emojis).most_common()

[('❤', 93270),
 ('😍', 39388),
 ('😂', 26317),
 ('💕', 24849),
 ('😊', 21511),
 ('😎', 20055),
 ('📷', 16379),
 ('💙', 14751),
 ('✨', 14616),
 ('😘', 13449),
 ('📸', 13414),
 ('😉', 13393),
 ('☀', 12532),
 ('😜', 11775),
 ('🔥', 11705),
 ('🎄', 11438),
 ('😁', 11301),
 ('💜', 10965),
 ('💯', 10909)]

In [61]:
# Tweets whose in_reply_to_user_id_str is set.
reply_tweets = df.loc[df['in_reply_to_user_id_str'].notnull()]

# Replies addressed to one particular user id
# (another id that was tried against the full frame: '3459258929').
reply_tweets.loc[reply_tweets['in_reply_to_user_id_str'].isin(['3400842010'])]

Unnamed: 0,id_str,text,in_reply_to_user_id_str
391358,686738366596038656,@hulsey_g 💜 love my fave Hulsey,3400842010


In [60]:
# Preview the first reply tweets instead of rendering the whole frame.
reply_tweets.head(10)

Unnamed: 0,id_str,text,in_reply_to_user_id_str
13,815332685581287424,@HSFBfan15 it's all Bama now 😉 What's your tak...,3459258929
41,661176222681063425,"""@QueenKIT26: Follow me on IG @msshilaunda"" #F...",407797430
47,732330418943352833,@katnhanblack making her Nico face. Sissie dat...,2869769803
121,661289655153856513,@DylanMcDermott @MaggieQ he's beautiful your S...,588570637
140,667014447496454144,@morganherington imy more😍,190419952
151,740136071082741760,"@MirandaFloyd8 yes, I love my trashy best frie...",801124040
195,792548517034225664,@On_Sightt Presents #unfilteredfridaysatl Each...,3354084106
309,662824278518317057,@oliviahill21 thanks lovie!❤️,766669788
364,665643982295343105,@johnny_L5 we gotchu 😎,232040031
416,791109990400614400,"@itskeva ! Mariah and I saw this and said ""Kev...",1120105320


In [65]:
# Rebuild the frame with the additional 'retweeted_status' column.
# NOTE(review): this rebinds `df`; cells written against the three-column frame
# operate on a different frame when they are re-executed after this cell.
df = pd.DataFrame(data, columns=['id_str','text','in_reply_to_user_id_str', 'retweeted_status'])

# Show the first rows that carry a retweeted_status. A bare expression that is
# not the last statement of a cell is evaluated and thrown away, so the result
# is displayed explicitly (display() is provided by the notebook kernel).
display(df[df['retweeted_status'].notnull()].head())

# Inspect the raw parsed JSON of the first tweet.
data[0]

{'in_reply_to_status_id_str': None,
 'in_reply_to_status_id': None,
 'possibly_sensitive': False,
 'coordinates': {'coordinates': [-117.919, 34.0567], 'type': 'Point'},
 'created_at': 'Wed Jan 20 09:08:00 +0000 2016',
 'truncated': False,
 'in_reply_to_user_id_str': None,
 'source': '<a href="http://instagram.com" rel="nofollow">Instagram</a>',
 'retweet_count': 0,
 'retweeted': False,
 'geo': {'coordinates': [34.0567, -117.919], 'type': 'Point'},
 'in_reply_to_screen_name': None,
 'is_quote_status': False,
 'entities': {'urls': [{'display_url': 'instagram.com/p/BAwViV1KIRn_…',
    'indices': [32, 55],
    'expanded_url': 'https://www.instagram.com/p/BAwViV1KIRn_PJ15OFzlN3muUxZ7M_lFIwhz1M0/',
    'url': 'https://t.co/ylNndaC0ls'}],
  'hashtags': [],
  'user_mentions': [],
  'symbols': []},
 'id_str': '689736119580233728',
 'in_reply_to_user_id': None,
 'favorite_count': 0,
 'id': 689736119580233728,
 'text': 'LoL 😂 @ West Covina, California https://t.co/ylNndaC0ls',
 'place': {'country