In [None]:
import pandas as pd
import csv
import jsonlines
import json
import chardet

import dateutil.parser
import datetime

In [None]:
# Wall-clock timestamp taken at the start of the run; it is subtracted from the end
# timestamp at the bottom of the notebook to get the total run time.
# The files are rather large, so knowing the run time helps with planning the runs and
# with noticing when the computer is reaching its limit or needs some time to cool off.
# datetime.now() already returns a naive datetime, so no isoformat()/parse round trip is needed.
start_time = datetime.datetime.now()

In [None]:
# This is only used when I'm doing closer inspection on the text bodies of the tweets
# pd.set_option('display.max_columns', None)

## Cleaning of split files

Originally I had daily files of tweets regarding the war in Ukraine. As those files were too large to process, I split them via Windows PowerShell into smaller files of a maximum of 800'000 tweets each.  

Those split files were then used for further processing.

In [None]:
# Plain-text log that records how many Tweets end up in the cleaned files
# as well as the number of Tweets per keyword
# (plain string literal: there is nothing to interpolate, so no f-prefix is needed)
Cleaning_Log_File = 'Cleaning_Log_File.txt'

Reading in the files:

In [None]:
# Base name of the input file, without the extension; this part is reused further down
# to build the names of the output files
file_name = 'input_file_name'
# Full name of the input file: base name plus the data type ending
input_file = f'{file_name}.json'
 

In [None]:
input_file

To keep a better overview, as I worked with a lot of files, I created a log-file. I decided to use a conventional text file for this, but one could also create another type of file, for example a csv file.

In [None]:
# Start this file's entry in the log with the name of the tweet file
with open(Cleaning_Log_File, mode="a") as log_handle:
    log_handle.write(input_file)

In [None]:
def load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Although the input files carry a ``.json`` ending, they hold one JSON
    document per line (jsonl), so ``json.load()`` cannot be used on the file
    as a whole.

    Parameters
    ----------
    path : str or path-like
        Location of the JSON Lines file.

    Returns
    -------
    list
        The decoded JSON value of every non-blank line, in file order.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    # utf-8-sig strips a leading byte order mark if there is one;
    # errors='ignore' silently drops bytes that cannot be decoded
    with open(path, 'r', encoding='utf-8-sig', errors='ignore') as reader:
        # Blank or whitespace-only lines carry no record and would make json.loads() raise,
        # so they are skipped
        return [json.loads(line) for line in reader if line.strip()]

In [None]:
json_data=load_jsonl(input_file)

In [None]:
len(json_data)

In [None]:
# Record the total number of Tweets read from the input file in the log
with open(Cleaning_Log_File, mode="a") as log_handle:
    log_handle.write(f'\nTotal Length: {len(json_data)}')

In [None]:
json_data[0:5]

### Filtering for English tweets:

In [None]:
def only_english(data):
    """Return the entries whose "lang" field marks them as English.

    An entry counts as English when its language code is exactly ``"en"`` or
    starts with ``"en-"`` (a code with a region subtag). Entries without a
    "lang" key, or whose "lang" value is not a string, are skipped instead of
    raising an error.

    Parameters
    ----------
    data : iterable of dict
        Tweet dictionaries.

    Returns
    -------
    list of dict
        The English entries, in their original order.
    """
    json_data_en = []
    for entry in data:
        lang = entry.get("lang")
        # Compare the code itself rather than testing for a substring,
        # so that only English language codes can match
        if isinstance(lang, str) and (lang == "en" or lang.startswith("en-")):
            json_data_en.append(entry)

    return json_data_en
            

In [None]:
json_data_en = only_english(json_data)

In [None]:
len(json_data_en)

In [None]:
# Record the number of English Tweets in the log
with open(Cleaning_Log_File, mode="a") as log_handle:
    log_handle.write(f'\nEnglish Tweets: {len(json_data_en)}')

In [None]:
json_data_en[0:2]

### Filtering for the data I want to keep per tweet:

There is a lot of data per tweet that is not needed. To reduce the data, everything that is not related to the tweet's text, time of creation, identification, user, language, or retweet and favorite status is removed.


In [None]:
def filter_data(data):
    """Reduce every tweet to the fields that are kept for the analysis.

    Parameters
    ----------
    data : iterable of dict
        Tweet dictionaries.

    Returns
    -------
    list of dict
        One slimmed-down dictionary per tweet, in input order. The
        "retweeted_status" entry is only present for tweets that carry it.

    Raises
    ------
    KeyError
        If a tweet lacks one of the mandatory fields.
    """
    # The order of this tuple is the key order of the resulting dictionaries
    wanted_fields = (
        "id",
        "id_str",
        "created_at",
        "full_text",
        "retweeted_status",
        "user",
        "retweet_count",
        "favorite_count",
        "favorited",
        "retweeted",
        "lang",
    )
    return [
        {
            field: tweet[field]
            for field in wanted_fields
            # "retweeted_status" is optional; every other field is mandatory
            if field != "retweeted_status" or field in tweet
        }
        for tweet in data
    ]

In [None]:
filtered_data = filter_data(json_data_en)

In [None]:
len(filtered_data)

In [None]:
# Checking if everything looks as it should
filtered_data[0:3]

### Adding a column for the full retweet text

For retweets over 140 characters, the "full_text" does not show the complete content. But retweets contain the full text of the original tweet under "full_text" nested in the "retweeted_status". 

In [None]:
df_filtered_data = pd.DataFrame(filtered_data)

In [None]:
len(df_filtered_data)

In [None]:
df_filtered_data[0:2]

In [None]:
def get_retweeted_text(data):
    """Collect the full text of the original tweet for every entry in ``data``.

    The text is nested under "retweeted_status" -> "full_text". The result is
    meant to be added as a new column to the dataframe, so entries that are
    not retweets must not be skipped: they get the placeholder "-" to keep the
    texts aligned with their rows.

    Parameters
    ----------
    data : iterable of dict
        Tweet dictionaries.

    Returns
    -------
    list of str
        One item per entry, in input order: the retweeted full text, or "-".

    Raises
    ------
    KeyError
        If a "retweeted_status" entry has no "full_text".
    """
    # The loop variable is used directly, so no separate index counter is needed
    # and any iterable (not only an indexable list) can be passed in
    return [
        tweet["retweeted_status"]["full_text"] if "retweeted_status" in tweet else "-"
        for tweet in data
    ]

In [None]:
# retrieving a list of the retweeted texts using my function
list_retweeted_text = get_retweeted_text(filtered_data)

In [None]:
list_retweeted_text[0:2]

In [None]:
# adding the list as a column to my dataframe
df_filtered_data["retweeted_status_full_text"] = list_retweeted_text

In [None]:
# Checking if it matches 
df_filtered_data[0:2]

In [None]:
# The nested retweet object is dropped from the dataframe at this point.
# errors='ignore' keeps this cell from raising a KeyError when the column does not exist,
# i.e. when none of the tweets in the file was a retweet or when the cell is run a second time.
df_filtered_data = df_filtered_data.drop(columns='retweeted_status', errors='ignore')

In [None]:
df_filtered_data[0:2]

### Filtering out duplicates

Here I want to check and filter out potential duplicates. There should not be any, but you never know and it is definitely better to check.

In [None]:
# Duplicates are identified via id_str, as it is more reliable than id:
# id as an integer sometimes gets corrupted because of its length
df_no_duplicates = df_filtered_data.drop_duplicates(subset='id_str', keep='first')

In [None]:
len(df_no_duplicates)

In [None]:
# Record the number of unique English Tweets in the log
with open(Cleaning_Log_File, mode="a") as log_handle:
    log_handle.write(f'\nEnglish Tweets w/o duplicates: {len(df_no_duplicates)}')

Saving cleaned data as a json file.  

When working with large data, this will take a bit of time. Therefore, this step was not used for all files.

In [None]:
#output = df_no_duplicates.to_dict(orient="records")

In [None]:
#output[0:4]

In [None]:
# Save cleaned data into file
#with open(file_name+"_clean.json", "w") as twitter_data_file:
 #   json.dump(output, twitter_data_file, indent=4, sort_keys=True)

## Selecting Tweets regarding the chosen keywords: war, sanction, invasion and humanitarian crisis

In [None]:
# From here on, only the columns that are relevant for creating the counts are kept
df = df_no_duplicates.filter(
    items=["id_str", "created_at", "full_text", "retweeted_status_full_text"],
    axis="columns",
)

In [None]:
df[0:2]

#### War

For the keyword "war" I had to add a whitespace after the keyword -> "war_".  
This was necessary because "war" is part of many other words: without the trailing whitespace I would have gotten many tweets containing words like "warm" or "warning" instead of "war". By making sure that nothing comes directly after "war", the results only contain war-related tweets, as there are only very few, seldom used words ending in "war" that are not war-related. Note that this also excludes occurrences that are directly followed by punctuation or that stand at the very end of a tweet (e.g. "war."), and that the search is case-sensitive. A better solution would have been to use natural language processing methods, but this was not a feasible option within the time limits of this project for the amount of data that would have had to be processed.

In [None]:
# Creating dataframe only with the tweets containing "war " in full_text and/or the added retweeted_status_full_text.
# regex=False searches for the literal text; na=False counts a missing text as "no match",
# so the combined mask stays purely boolean and can be used for indexing.
# NOTE(review): the search is case-sensitive, so "War " or "WAR " are not matched - confirm this is intended.
war_in_tweet = df['full_text'].str.contains("war ", regex=False, na=False)
war_in_retweet = df['retweeted_status_full_text'].str.contains("war ", regex=False, na=False)
df_war = df[war_in_tweet | war_in_retweet]

In [None]:
df_war[0:5]

In [None]:
len(df_war)

In [None]:
output = df_war.to_dict(orient="records")

In [None]:
output[0:2]

In [None]:
# Saving the slimmed down data of tweets containing the keyword "war" into a separate "war"-file.
# The records are taken directly from df_war instead of the shared `output` variable: every keyword
# section overwrites `output`, so it could hold another keyword's data if cells are run out of order.
with open(file_name+"_war.json", "w") as twitter_data_file:
    json.dump(df_war.to_dict(orient="records"), twitter_data_file, indent=4, sort_keys=True)

In [None]:
# Record the number of tweets containing the string "war" in the log
with open(Cleaning_Log_File, mode="a") as log_handle:
    log_handle.write(f'\nWar Tweets: {len(df_war)}')

#### Sanction

In [None]:
# Creating dataframe only with the tweets containing "sanction" in full_text and/or retweeted_status_full_text.
# regex=False searches for the literal text; na=False counts a missing text as "no match",
# so the combined mask stays purely boolean and can be used for indexing.
# NOTE(review): the search is case-sensitive, so "Sanction" is not matched - confirm this is intended.
sanction_in_tweet = df['full_text'].str.contains("sanction", regex=False, na=False)
sanction_in_retweet = df['retweeted_status_full_text'].str.contains("sanction", regex=False, na=False)
df_sanction = df[sanction_in_tweet | sanction_in_retweet]

In [None]:
df_sanction[0:5]

In [None]:
len(df_sanction)

In [None]:
output = df_sanction.to_dict(orient="records")

In [None]:
# Saving the slimmed down data of tweets containing the keyword "sanction" into a separate "sanction"-file.
# The records are taken directly from df_sanction instead of the shared `output` variable: every keyword
# section overwrites `output`, so it could hold another keyword's data if cells are run out of order.
with open(file_name+"_sanction.json", "w") as twitter_data_file:
    json.dump(df_sanction.to_dict(orient="records"), twitter_data_file, indent=4, sort_keys=True)

In [None]:
# Record the number of tweets containing the string "sanction" in the log
with open(Cleaning_Log_File, mode="a") as log_handle:
    log_handle.write(f'\nSanction Tweets: {len(df_sanction)}')

#### Invasion

In [None]:
# Creating dataframe only with the tweets containing "invasion" in full_text and/or retweeted_status_full_text.
# regex=False searches for the literal text; na=False counts a missing text as "no match",
# so the combined mask stays purely boolean and can be used for indexing.
# NOTE(review): the search is case-sensitive, so "Invasion" is not matched - confirm this is intended.
invasion_in_tweet = df['full_text'].str.contains("invasion", regex=False, na=False)
invasion_in_retweet = df['retweeted_status_full_text'].str.contains("invasion", regex=False, na=False)
df_invasion = df[invasion_in_tweet | invasion_in_retweet]

In [None]:
df_invasion[0:5]

In [None]:
len(df_invasion)

In [None]:
output = df_invasion.to_dict(orient="records")

In [None]:
# Saving the slimmed down data of tweets containing the keyword "invasion" into a separate "invasion"-file.
# The records are taken directly from df_invasion instead of the shared `output` variable: every keyword
# section overwrites `output`, so it could hold another keyword's data if cells are run out of order.
with open(file_name+"_invasion.json", "w") as twitter_data_file:
    json.dump(df_invasion.to_dict(orient="records"), twitter_data_file, indent=4, sort_keys=True)

In [None]:
# Record the number of tweets containing the string "invasion" in the log
with open(Cleaning_Log_File, mode="a") as log_handle:
    log_handle.write(f'\nInvasion Tweets: {len(df_invasion)}')

#### Humanitarian Crisis

In [None]:
# Creating dataframe only with the tweets containing "humanitarian crisis" in full_text and/or retweeted_status_full_text.
# regex=False searches for the literal text; na=False counts a missing text as "no match",
# so the combined mask stays purely boolean and can be used for indexing.
# NOTE(review): the search is case-sensitive, so "Humanitarian crisis" is not matched - confirm this is intended.
crisis_in_tweet = df['full_text'].str.contains("humanitarian crisis", regex=False, na=False)
crisis_in_retweet = df['retweeted_status_full_text'].str.contains("humanitarian crisis", regex=False, na=False)
df_humanitarian_crisis = df[crisis_in_tweet | crisis_in_retweet]

In [None]:
df_humanitarian_crisis[0:5]

In [None]:
len(df_humanitarian_crisis)

In [None]:
output = df_humanitarian_crisis.to_dict(orient="records")

In [None]:
# Saving the slimmed down data of tweets containing the keyword "humanitarian crisis" into a separate "humanitarian crisis"-file.
# The records are taken directly from df_humanitarian_crisis instead of the shared `output` variable: every keyword
# section overwrites `output`, so it could hold another keyword's data if cells are run out of order.
with open(file_name+"_humanitarian_crisis.json", "w") as twitter_data_file:
    json.dump(df_humanitarian_crisis.to_dict(orient="records"), twitter_data_file, indent=4, sort_keys=True)

In [None]:
# Record the number of tweets containing the string "humanitarian crisis" in the log
# and close this file's entry with a separator line
with open(Cleaning_Log_File, mode="a") as log_handle:
    log_handle.write(f'\nHumanitarian Crisis Tweets: {len(df_humanitarian_crisis)}\n')
    log_handle.write('\n ---------------------------------------- \n \n')

The following part is used to check how long the code needed to run. As a large number of large files was processed, this helped with planning run times and served as an indicator that I had to stop other processes I was running simultaneously, so as not to exhaust my machine and crash it.

In [None]:
# Wall-clock timestamp at the end of the run; datetime.now() already returns a naive datetime,
# so no isoformat()/parse round trip is needed
end_time = datetime.datetime.now()

In [None]:
start_time

In [None]:
end_time

In [None]:
time_total = end_time - start_time
time_total