# Splitting of tweets into xlsx-files

This notebook is to divide 520 tweets to eight xlsx-files, so that every tweet will be labelled by three persons. This means that each member of the group will get a excel-file that has 195 tweets.

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import json
import re

#Imports for encrypting
import numpy as npv
import base64

from getpass import getpass
from cryptography.fernet import Fernet
from cryptography.hazmat.backends import default_backend
from cryptography.hazmat.primitives import hashes
from cryptography.hazmat.primitives.kdf.pbkdf2 import PBKDF2HMAC

## Test and train sets

In [None]:
#reads data from a file
data_filepath = r'C:\Users\Inka\Downloads\kws_final_01-04-2020.json'

data_arr = []
column_names = ['id','text']
with open(data_filepath, 'r') as f:
    for tweet in f:
        selected_row = []
        json_tweet = json.loads(tweet)
        try:
            # filter out retweets and non-English tweets:
            #Extended tweets
            if not json_tweet['retweeted'] and 'RT @' not in json_tweet['text'] and json_tweet['lang'] == 'en' and json_tweet['extended_tweet'] != False: #has extended
                tweet_id = json_tweet['id']
                text = json_tweet['extended_tweet']['full_text']
                new_list = [tweet_id, text]
                data_arr.append(new_list)
            #Not extended 
            elif not json_tweet['retweeted'] and 'RT @' not in json_tweet['text'] and json_tweet['lang'] == 'en':
                print('x')
                for col in column_names:
                    selected_row.append(json_tweet[col])
                data_arr.append(selected_row)
        except KeyError:
            pass
    data = pd.DataFrame(data_arr, columns=column_names)

In [15]:
#Each 8 members labels 195 tweets.
s = 520 #sample size
n = 3 #the amount that each tweet is labelled
p = 8 #number of people

In [None]:
#Collects 520 random tweets for labelling from the final data
data_set, label_set = train_test_split(data, test_size = s)

In [None]:
print(label_set)
#Saves the original set:
label_set.to_excel(r'C:\Users\Inka\Desktop\Koulu\Project course\Labelling\Original_set.xlsx', index = False)

In [None]:
id_list = label_set['id'].tolist()

## Pseudonymization

In [3]:
def initialize_crypto(password):
    """
    Derives crypto key using the password and initializes the crypto library
    that can be then called via encrypt and decrypt functions.
    -----
    Returns: cryptography.fernet.Fernet object
    """
    password=password.encode()
    salt = b'm\xfffFvxfb\xbexB\x7f2\xaa\x1dj\x8c\x8f\xf1\\{' 
    kdf = PBKDF2HMAC(
        algorithm=hashes.SHA256(),
        length=32,
        salt=salt,
        iterations=100000,
        backend=default_backend()
    )

    return Fernet(base64.urlsafe_b64encode(kdf.derive(password)))

In [4]:
def encrypt_ids(ids, password):
    """
    Encrypts a list of ids. Works on both strings and ints.
    -----
    Returns: list of bytes-typeobjects
    """
    
    crypto = initialize_crypto(password)
    if isinstance(ids[0], int):
        return [crypto.encrypt(bytes(str(ID), 'utf-8')) for ID in ids]
    else:
        return [crypto.encrypt(bytes(ID, 'utf-8')) for ID in ids] 

In [5]:
def decrypt_ids(encr_ids, password):
    """
    Decrypts a list of bytes objects..
    -----
    Returns: list of strings
    """
    
    crypto = initialize_crypto(password)
    keys = [crypto.decrypt(encr_ID).decode("utf-8") for encr_ID in encr_ids] 
    try:
        return [int(k) for k in keys]
    except Exception as ex:
        return keys

### Input password and encrypt

In [6]:
password = getpass("Please enter a passphrase > : \n")    # Reads what user inputs

Please enter a passphrase > : 
········


In [None]:
#Encrypt
new_id_list = encrypt_ids(id_list, password)

In [7]:
#Decrypt
T = decrypt_ids([b'gAAAAABehk1E1wAAecSppFGs_lw7HHq2_Ru0WKwDl-ZwQH-e2aC0DJbzYM6VH4wp1CS8U-K2_VU25sM0Bk_qOvviBkPiJbIqiBtk4-J6AjNe9BpMF8T3_gk='], password)

In [8]:
T

[1245335133659545605]

## Dividing the data into xlsx-files

In [None]:
label_set

In [None]:
label_set = label_set.drop(columns=['id'])
label_set = np.c_[new_id_list, label_set]
label_set = pd.DataFrame(label_set, columns=['id', 'text'])

In [None]:
label_set

In [None]:
#adds extra column for labelling
label_set["label"] = "" #label_df

#Splits the new_df into number of p dataframes:
df_split = np.array_split(label_set, p)

df1 = df_split[0].append([df_split[1], df_split[2]])
df2 = df_split[1].append([df_split[2], df_split[3]])
df3 = df_split[2].append([df_split[3], df_split[4]])
df4 = df_split[3].append([df_split[4], df_split[5]])
df5 = df_split[4].append([df_split[5], df_split[6]])
df6 = df_split[5].append([df_split[6], df_split[7]])
df7 = df_split[6].append([df_split[7], df_split[1]])
df8 = df_split[7].append([df_split[1], df_split[2]])

In [None]:
df1.to_excel(r'C:\Users\Inka\Desktop\Koulu\Project course\Labelling\Labelling_Inka.xlsx', index = False)
df2.to_excel(r'C:\Users\Inka\Desktop\Koulu\Project course\Labelling\Labelling_Maria.xlsx', index = False)
df3.to_excel(r'C:\Users\Inka\Desktop\Koulu\Project course\Labelling\Labelling_Ville.xlsx', index = False)
df4.to_excel(r'C:\Users\Inka\Desktop\Koulu\Project course\Labelling\Labelling_Maryam.xlsx', index = False)
df5.to_excel(r'C:\Users\Inka\Desktop\Koulu\Project course\Labelling\Labelling_Annika.xlsx', index = False)
df6.to_excel(r'C:\Users\Inka\Desktop\Koulu\Project course\Labelling\Labelling_Veera.xlsx', index = False)
df7.to_excel(r'C:\Users\Inka\Desktop\Koulu\Project course\Labelling\Labelling_Nuutti.xlsx', index = False)
df8.to_excel(r'C:\Users\Inka\Desktop\Koulu\Project course\Labelling\Labelling_Estanislao.xlsx', index = False)

In [9]:
#Let's read the exels
lb1 = pd.read_excel(r'C:\Users\Inka\Desktop\Koulu\Project course\Labelling\Labelling_Inka.xlsx')
lb2 = pd.read_excel(r'C:\Users\Inka\Desktop\Koulu\Project course\Labelling\Labelling_Maria.xlsx')
lb3 = pd.read_excel(r'C:\Users\Inka\Desktop\Koulu\Project course\Labelling\Labelling_Ville.xlsx')
lb4 = pd.read_excel(r'C:\Users\Inka\Desktop\Koulu\Project course\Labelling\Labelling_Maryam.xlsx')
lb5 = pd.read_excel(r'C:\Users\Inka\Desktop\Koulu\Project course\Labelling\Labelling_Annika.xlsx')
lb6 = pd.read_excel(r'C:\Users\Inka\Desktop\Koulu\Project course\Labelling\Labelling_Veera.xlsx')
lb7 = pd.read_excel(r'C:\Users\Inka\Desktop\Koulu\Project course\Labelling\Labelling_Nuutti.xlsx')
lb8 = pd.read_excel(r'C:\Users\Inka\Desktop\Koulu\Project course\Labelling\Labelling_Estanislao.xlsx')

In [10]:
lb2.head()

Unnamed: 0,id,text,label
0,b'gAAAAABehk1E1AgtNCoFyOt2S2_1z1FW30NViLQM15xd...,@StompyIsAwesome @CraigSJ I’m not gonna argue ...,
1,b'gAAAAABehk1E-jOLZghyGHJHDucFIjGAe9rOLAKbVEJP...,"""It would be deeply ironic for [neoliberal] ad...",
2,b'gAAAAABehk1EcNemYBpM20RU3eS2YS3nCvLRzBHd2bGT...,If only we could engineer similar hysteria ove...,
3,b'gAAAAABehk1E95z41_KVRXYv43bkK2Ue3dnQzVBSTUxk...,@Carbazas_ They’re recycling the same document...,
4,b'gAAAAABehk1E1rHmsEzc3XkCrES7pUQky07uqYBhpjzh...,Time to start taking a daily multivitamin for ...,


## Combining the xlsx-files into one

In [12]:
#Combines all the files together:
combined = lb1.append([lb2, lb3, lb4, lb5, lb6, lb7, lb8])
combined.reset_index(inplace=True)
combined = combined.drop(columns=['index'])

In [13]:
combined

Unnamed: 0,id,text,label
0,b'gAAAAABehk1EcROeAH5VCAj58Uk7zRY-VLEkgNyZOS6l...,@szegfu_ 😍😍😍 I bet I love landscapes like thes...,
1,b'gAAAAABehk1EA0dIPQ8Y5jenaZA19qEQ61fU892u6Pm3...,Nominations for the Environmental Leadership A...,
2,b'gAAAAABehk1E00k3-d0QrYaJ6NP3rdqH22frBaT8u6Jj...,How can this be a win???\nTrump's EPA chief cl...,
3,b'gAAAAABehk1E1t9KZA3VWqZWzU5dyOiGON0v8AAirIAz...,I'm starting to think when the worse effects o...,
4,b'gAAAAABehk1EiBelsO_d3bCm21IrpVCETCCAGZ9pfYkk...,@RahulGandhi My voice may not be heard but you...,
...,...,...,...
1555,b'gAAAAABehk1EV_SMbiuiYvf_eKche0NKkU_5LSA2Qdtf...,Small collection of animal and bird photograph...,
1556,b'gAAAAABehk1EhH5E5qZinExeg8NLmErM02WJxSnFoyE0...,You won’t read a truer statement on Twitter to...,
1557,b'gAAAAABehk1Eahli8Lj9cEUAOFOyZvnabed1NQobhVnp...,WATCH: The novel coronavirus is primarily pass...,
1558,b'gAAAAABehk1ET-9Atsjd_O1uVYroun2GXHLRVyD_KvK2...,Here's the thing: I do not find what I am rese...,


In [17]:
label_list = []
for i in range(s):

    label1 = combined.iloc[i, 2]
    label2 = combined.iloc[i + s, 2] 
    label3 = combined.iloc[i + s*2, 2]
    
    if label1 == label2:
        label = label1
        label_list.append(label)
    elif label2 == label3:
        label = label2
        label_list.append(label)
    elif label3 == label1:
        label = label3
        label_list.append(label)
    else:
        label = 'xxx'
        label_list.append(label)
        print(combined.iloc[i, 2])

nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan


In [21]:
print(label_list)

['xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx'

In [22]:
orig_set = pd.read_excel(r'C:\Users\Inka\Desktop\Koulu\Project course\Labelling\Original_set.xlsx')
final_set = np.c_[orig_set, label_list]
final_set = pd.DataFrame(final_set, columns=['id', 'text', 'label'])

In [23]:
final_set

Unnamed: 0,id,text,label
0,1245612731593441024,@szegfu_ 😍😍😍 I bet I love landscapes like thes...,xxx
1,1245425774481616896,Nominations for the Environmental Leadership A...,xxx
2,1245339442207804928,How can this be a win???\nTrump's EPA chief cl...,xxx
3,1245560235458096896,I'm starting to think when the worse effects o...,xxx
4,1245439624089985024,@RahulGandhi My voice may not be heard but you...,xxx
...,...,...,...
515,1245328820766274048,"""Trump has a kind of grip on the media where t...",xxx
516,1245320066264116992,@ArvindKejriwal @PMOIndia @narendramodi As Loc...,xxx
517,1245374742900678912,"#US one of the biggest donor, has dished out i...",xxx
518,1245292978937253888,Thank you from Austria. Despite and because of...,xxx
