# Splitting of tweets into xlsx-files

This notebook divides 520 tweets into eight xlsx files so that every tweet is labelled by three people. Each of the eight group members therefore gets an Excel file containing 195 tweets (520 × 3 / 8).

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import json
import re

## Test and train sets

In [None]:
#reads data from a file
# The file is read as one JSON-encoded tweet per line. Only original
# (non-retweet) English tweets are kept, reduced to the columns in column_names.
data_filepath = "/data"


data_arr = []
column_names = ['id','text']
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default text encoding.
with open(data_filepath, 'r', encoding='utf-8') as f:
    for line in f:
        # Blank separator lines are not valid JSON; skip them instead of crashing.
        if not line.strip():
            continue
        json_tweet = json.loads(line)
        try:
            # filter out retweets and non-English tweets
            is_retweet = json_tweet['retweeted'] or 'RT @' in json_tweet['text']
            if is_retweet or json_tweet['lang'] != 'en':
                continue
            selected_row = [json_tweet[col] for col in column_names]
        except KeyError:
            # Records lacking any of the required fields are deliberately skipped.
            continue
        data_arr.append(selected_row)

# Built after the file is closed; it only needs the collected rows.
data = pd.DataFrame(data_arr, columns=column_names)

In [None]:
# Preview the first rows of the filtered tweets (columns: id, text).
data.head()

In [None]:
#Each of the p = 8 members labels s*n/p = 195 tweets.
s = 520 #sample size: number of distinct tweets to label
n = 3 #number of times each tweet is labelled
p = 8 #number of people doing the labelling

In [None]:
#Collects 520 random tweets for labelling from the final data
# label_set receives s rows, data_set the remainder. The fixed random_state makes
# the draw repeatable: label_set is reused when the labelled files are merged
# back, so a re-run of the notebook must select exactly the same tweets that
# were exported for labelling.
data_set, label_set = train_test_split(data, test_size = s, random_state = 42)

In [None]:
# Show the tweets that were sampled for labelling.
print(label_set)

In [None]:
# remove URLs
def preprocess(item):
    """Return *item* with every URL-like substring deleted.

    Anything that begins with ``http`` and continues up to the next
    whitespace character is dropped. Surrounding whitespace is left as is,
    so a removed URL may leave two adjacent spaces behind.
    """
    url_pattern = re.compile(r'http\S+')
    return url_pattern.sub('', item)

In [None]:
# Strip URLs from the text of every sampled tweet; the result keeps label_set's index.
preprocessed_tweets = label_set['text'].apply(preprocess)

In [None]:
# Preview the first cleaned tweets.
preprocessed_tweets.head()

In [None]:
# Put the tweet ids and the URL-free texts side by side as a two-column array ...
label_df = np.c_[label_set['id'], preprocessed_tweets]
# ... and wrap it in a DataFrame with the columns id, text and a fresh default row index.
label_df = pd.DataFrame(label_df, columns=column_names)

In [None]:
# Display the frame of ids and cleaned texts that will be handed out for labelling.
label_df

## Dividing the data into xlsx-files

In [None]:
#Creates a dataframe that consists of s*n rows: n stacked copies of the s tweets
add_df = label_df
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0. It always
# returns a new frame, so adding the column below never modifies label_df
# (the old `new_df = label_df` alias did exactly that when n == 1).
# Row labels are kept as they are, so each original index value occurs n times.
new_df = pd.concat([add_df] * n)

#adds extra column for labelling
new_df["label"] = ""

new_df

In [None]:
#Splits label_df into p consecutive chunks:
# NOTE(review): the chunks come from label_df, which has no "label" column;
# new_df (which has one) is not used here - confirm that this is intended.
df_split = np.array_split(label_df, p)

# Person k gets the n consecutive chunks k, k+1, ..., k+n-1, wrapping around to
# chunk 0 at the end (modulo p). Every chunk is therefore labelled by exactly n
# people. The previous hard-coded lists wrapped to chunks 1 and 2 instead of
# 0 and 1, so chunk 0 was labelled once and chunks 1 and 2 four times.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# The eight target names mean this unpacking requires p == 8.
df1, df2, df3, df4, df5, df6, df7, df8 = (
    pd.concat([df_split[(person + offset) % p] for offset in range(n)])
    for person in range(p)
)

In [None]:
# Inspect the first person's share of tweets.
df1

In [None]:
# Write one workbook per labeller, without the row index, as
# Labelling_<name>.xlsx in the shared labelling folder.
for frame, labeller in [
    (df1, 'Inka'),
    (df2, 'Maria'),
    (df3, 'Ville'),
    (df4, 'Maryam'),
    (df5, 'Annika'),
    (df6, 'Veera'),
    (df7, 'Nuutti'),
    (df8, 'Estanislao'),
]:
    frame.to_excel(
        r'C:\Users\Inka\Desktop\Koulu\Project course\Labelling\Labelling_' + labeller + '.xlsx',
        index = False,
    )

In [None]:
#Read the Excel files back in, one frame per labeller (lb1 = Inka ... lb8 = Estanislao)
lb1 = pd.read_excel(r'C:\Users\Inka\Desktop\Koulu\Project course\Labelling\Labelling_Inka.xlsx')
lb2 = pd.read_excel(r'C:\Users\Inka\Desktop\Koulu\Project course\Labelling\Labelling_Maria.xlsx')
lb3 = pd.read_excel(r'C:\Users\Inka\Desktop\Koulu\Project course\Labelling\Labelling_Ville.xlsx')
lb4 = pd.read_excel(r'C:\Users\Inka\Desktop\Koulu\Project course\Labelling\Labelling_Maryam.xlsx')
lb5 = pd.read_excel(r'C:\Users\Inka\Desktop\Koulu\Project course\Labelling\Labelling_Annika.xlsx')
lb6 = pd.read_excel(r'C:\Users\Inka\Desktop\Koulu\Project course\Labelling\Labelling_Veera.xlsx')
lb7 = pd.read_excel(r'C:\Users\Inka\Desktop\Koulu\Project course\Labelling\Labelling_Nuutti.xlsx')
lb8 = pd.read_excel(r'C:\Users\Inka\Desktop\Koulu\Project course\Labelling\Labelling_Estanislao.xlsx')

In [None]:
# Spot-check one of the files that was read back.
lb2.head()

## Combining the xlsx-files into one

In [None]:
#Combines all the files together:
# All eight labellers' frames are stacked. Previously only lb1..lb5 were
# included, so three labellers' votes were silently dropped and the combined
# frame was too short for the voting step. pd.concat replaces
# DataFrame.append, which was removed in pandas 2.0.
combined = pd.concat([lb1, lb2, lb3, lb4, lb5, lb6, lb7, lb8])
# reset_index keeps the old per-file row number as a leading "index" column;
# the voting cell relies on that when it reads the text from position 2 and
# the label from position 3.
combined.reset_index(inplace=True)

In [None]:
# Majority vote per tweet.
# Votes are gathered by tweet id instead of by the fixed row offsets
# i, i + s and i + 2*s: the labellers' files are built from rotating chunks,
# so rows that far apart in `combined` do not belong to the same tweet.
# sort=False keeps the tweets in order of first appearance in `combined`.
# NOTE(review): assumes an "id" column that still identifies each tweet after
# the Excel round-trip - confirm.
label_list = []
for _, votes in combined.groupby('id', sort=False):
    # Position 3 holds the label, position 2 the tweet text. Missing labels
    # are ignored so that two blank cells never count as agreement.
    counts = votes.iloc[:, 3].dropna().value_counts()

    if not counts.empty and counts.iloc[0] >= 2:
        # At least two labellers agree on the most frequent label.
        label_list.append(counts.index[0])
    else:
        # No two labellers agree: mark the tweet and print its text for review.
        label_list.append('xxx')
        print(votes.iloc[0, 2])

In [None]:
# Show the agreed label of every tweet ('xxx' marks tweets without agreement).
print(label_list)

In [None]:
# Attach the voted labels as a third column next to label_set's id and text.
# The pairing is purely positional: label_list needs one entry per row of
# label_set, in the same order.
# NOTE(review): label_set holds the text as sampled, not the URL-stripped
# version that was exported for labelling - confirm this is intended.
final_set = np.c_[label_set, label_list]
final_set = pd.DataFrame(final_set, columns=['id', 'text', 'label'])

In [None]:
# Display the final labelled data set.
final_set