# Topic Modeling using Theses' Abstracts
## 4. Data Preprocessing - Clean Data
Mai Vu<br>
Helsinki Metropolia University of Applied Sciences<br>
Bachelor’s Thesis<br>
May 2021

In [1]:
#Basic libraries
import pandas as pd
import numpy as np
import pickle

In [2]:
#Read the '|'-separated CSV and order the rows by the 'year' column
df = pd.read_csv('Theseus_data_19.5.2021_FINAL.csv', sep = '|').sort_values(['year'])
print('Data length:', len(df))

#'len' = number of whitespace-separated tokens in the English abstract
#plus the number in the Finnish abstract
df['len'] = (
    df['en'].str.split().str.len()
    .add(df['fi'].str.split().str.len())
)

Data length: 176738


In [3]:
#Clean data
#NOTE: np.nan is used throughout this cell instead of the np.NaN alias, which was
#removed in NumPy 2.0 (accessing np.NaN raises AttributeError there).

#original_language
#Values are matched by case-sensitive substring and collapsed onto one code each.
#The assignments run in this order, so a value containing 'fi' has already been
#rewritten to 'fi' before the 'en' and 'swe' checks look at it.
df.loc[df['original_language'].str.contains('fi', na = False), 'original_language'] = 'fi'
df.loc[df['original_language'].str.contains('en', na = False), 'original_language'] = 'en'
df.loc[df['original_language'].str.contains('swe', na = False), 'original_language'] = 'sv'
#Write np.nan into every entry that isnull() reports as missing
df.loc[df['original_language'].isnull(), 'original_language'] = np.nan
#Anything that is not missing and contains none of 'fi' / 'en' / 'sv' becomes 'other'
df.loc[~(df['original_language'].str.contains('fi', na = False) |
         df['original_language'].str.contains('en', na = False) |
         df['original_language'].str.contains('sv', na = False) |
         df['original_language'].isnull()) , 'original_language'] = 'other'

#organization
#Name fragments matched case-insensitively; a hit is rewritten to
#'<fragment> University of Applied Sciences'
school_names = ['Centria', 'Metropolia', 'Laurea', 'Arcada', 'Häme', 'Karelia', 'Novia', 'Åland', 'Haaga-Helia',
                'Vaasa', 'Satakunta', 'Saimaa', 'Tampere', 'Turku', 'Jyväskylä', 'Kymenlaakso', 'Mikkeli', 'Savonia',
                'Kajaani', 'South-Eastern Finland', 'Lapland', 'LAB']

#Alternative name fragments matched case-sensitively and mapped to the full name to store.
#A np.nan value means a matching organization entry is replaced by a missing value.
school_names_2 = {'Satakunnan' : 'Satakunta University of Applied Sciences',
                  'Lapin': 'Lapland University of Applied Sciences',
                  'Turun': 'Turku University of Applied Sciences',
                  'Diakonia': 'Diaconia University of Applied Sciences',
                  'Daikonia': 'Diaconia University of Applied Sciences',
                  'Lahden': 'Lahti University of Applied Sciences',
                  'Seinäjoen': 'Seinäjoki University of Applied Sciences',
                  'Keski-Pohjanmaan': 'Centria University of Applied Sciences',
                  'Humanistinen': 'Humak University of Applied Sciences',
                  'Pohjois-Karjalan': 'Karelia University of Applied Sciences',
                  'Rovaniemen': 'Rovaniemi University of Applied Sciences',
                  'Kaakkois-Suomen': 'South-Eastern Finland University of Applied Sciences',
                  'Poliisiammattikorkeakoulu': 'Police University College',
                  'Pirkanmaan' : 'Pirkanmaa University of Applied Sciences',
                  'Oulu' : 'Oulu University of Applied Sciences',
                  'tekstiili ja kemiantekniikan koulutusohjelma' : np.nan,
                  'Degree programme' : np.nan}

#Write np.nan into every entry that isnull() reports as missing
df.loc[df['organization'].isnull(), 'organization'] = np.nan
#First pass: case-insensitive fragments from school_names
for school_name in school_names:
    df.loc[df['organization'].str.contains(school_name, na = False, case = False), 'organization'] = school_name + ' University of Applied Sciences'
#Second pass: case-sensitive fragments from school_names_2, applied in dict order
for school_name, full_name in school_names_2.items():
    df.loc[df['organization'].str.contains(school_name, na = False), 'organization'] = full_name

In [4]:
#Retain only rows whose combined word count exceeds 135, then discard the helper column
df2 = df[df['len'].gt(135)].drop(columns = ['len'])

In [5]:
#Report how many rows passed the length filter and how many were removed by it
print('Final data length:', df2.shape[0])
print('Deleted', df.shape[0] - df2.shape[0], 'samples')

Final data length: 174955
Deleted 1783 samples


In [6]:
#Save datafile pickle
filename = 'final_data'
#The with-block closes the file even if pickle.dump raises
with open(filename, 'wb') as outfile:
    #Protocol 3 is pinned explicitly (an older protocol than newer Python defaults);
    #presumably for compatibility with older readers - TODO confirm
    pickle.dump(df2, outfile, protocol = 3)

In [7]:
#Test the saved file
#NOTE(review): pickle.load can execute arbitrary code while unpickling; only use it
#on files written by this notebook, never on files from an untrusted source
#The with-block closes the file even if pickle.load raises
with open(r'final_data','rb') as infile:
    temp = pickle.load(infile)

In [8]:
#Show five randomly chosen rows of the reloaded frame (no seed is set, so rows vary per run)
temp.sample(n = 5)

Unnamed: 0,handle,year,original_language,organization,google_translated_en,en,google_translated_fi,fi
96229,10024/109747,2016,fi,Häme University of Applied Sciences,0,The aim of this practice-based Bachelor’s thes...,0,Toiminnallisen opinnäytetyön tarkoituksena oli...
91717,10024/120469,2016,en,Centria University of Applied Sciences,0,This article examined the relationship of poli...,1,Tässä artikkelissa tarkasteltiin Bangladeshin ...
149931,10024/263668,2019,en,Turku University of Applied Sciences,0,"Open source software is growing, and widely us...",0,Avoimen lähdekoodin ohjelmiston määrä ja käytt...
104359,10024/107083,2016,en,Haaga-Helia University of Applied Sciences,0,Volunteer tourism is a fast growing trend and ...,1,Vapaaehtoistyö on nopeasti kasvava trendi ja k...
119707,10024/139813,2017,fi,Jyväskylä University of Applied Sciences,1,The city of Nivala spends 0.5-4M euros annuall...,0,Nivalan kaupunki käyttää korjausrakentamiseen ...
