# Load files from data

Load all the articles from local storage (`new_data` and `data`) and prepare the tables for the subsequent analysis.

---

### Loading packages

In [1]:
import datetime
import glob
import os
import re
import string

import numpy as np
import pandas as pd

## Loading data

The function below gathers all the .csv files within one category directory, concatenates them, labels the rows with the category name and returns the result as a data frame.

In [2]:
# Category names in their fixed order; the keys are the running numbers 1..12.
category = dict(enumerate(
    (
        'politik', 'wirtschaft', 'finanzen', 'feuilleton', 'sport', 'gesellschaft',
        'stil', 'technik-motor', 'wissen', 'reise', 'beruf-chance', 'aktuell',
    ),
    start=1,
))

In [3]:
def build_tables(category, base_dir="../new_data"):
    """Load every CSV file of one category into a single labelled data frame.

    Parameters
    ----------
    category : str
        Name of the sub-directory below ``base_dir`` that holds the
        category's ``.csv`` files. The same value is written into the
        ``label`` column of the result.
    base_dir : str, optional
        Directory containing one sub-directory per category. Defaults to
        ``"../new_data"``, the location this function has always read from.

    Returns
    -------
    pandas.DataFrame
        The rows of all CSV files of the category with a fresh 0..n-1 index
        and an additional ``label`` column set to ``category``.

    Raises
    ------
    ValueError
        If the category directory contains no ``.csv`` file.
    """
    # The former `category is 'aktuell'` branch compared object identity with
    # a string literal and built the same path as its else-branch, so one
    # path expression covers every category.
    path = f"{base_dir}/{category}/"

    # glob returns matches in arbitrary order; sorting keeps the row order
    # reproducible between runs and machines.
    all_files = sorted(glob.glob(path + '*.csv'))
    if not all_files:
        # pd.concat([]) would fail with an unspecific
        # "No objects to concatenate"; name the offending directory instead.
        raise ValueError(f"No .csv files found in {path!r}")

    raw_articles = [
        pd.read_csv(filename, index_col=None, header=0)
        for filename in all_files
    ]
    faz_articles = pd.concat(raw_articles, axis=0, ignore_index=True)
    faz_articles['label'] = category

    return faz_articles

---
---
# Generate Test and Train Data

**This for-loop builds one labelled data frame per category (using the function from the first step) and concatenates them into a single data frame.**

## Train Data:

In [4]:
frames = []

# Load one labelled frame per category. Only the category names (the dict
# values) are needed; the numeric keys are not used here.
for label in category.values():
    frames.append(build_tables(label))

# Concatenate once after the loop; doing it inside the loop rebuilt the
# steadily growing frame on every iteration for the same final result.
faz_train = pd.concat(frames, axis=0, ignore_index=True)

In [5]:
# Drop rows that are identical in every column, 'label' included.
# NOTE(review): an article stored under two categories keeps both rows because
# the labels differ - presumably relevant for 'aktuell'; verify.
faz_train = faz_train.drop_duplicates()

In [6]:
# Non-null values per column for each category.
faz_train.groupby('label').count()

Unnamed: 0_level_0,link,published,title,detailed
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
aktuell,748,748,748,748
beruf-chance,21,21,21,21
feuilleton,113,113,113,113
finanzen,77,77,77,77
gesellschaft,144,144,144,144
politik,263,263,263,263
reise,11,11,11,11
sport,240,240,240,240
stil,27,27,27,27
technik-motor,33,33,33,33


# Split train data into 'politik', 'sport', 'wirtschaft' and 'rest':

This step is for the sake of labelling the subsequent data and to get an idea of the data distribution (**'Politics' makes up about 17% of the train data**):

In [7]:
# Number of distinct categories present in the training frame.
faz_train.label.nunique()

12

In [8]:
# One frame for each of the three categories that are selected by name here.
faz_pol = faz_train.loc[faz_train['label'].eq('politik')]
faz_sport = faz_train.loc[faz_train['label'].eq('sport')]
faz_eco = faz_train.loc[faz_train['label'].eq('wirtschaft')]

In [9]:
# Independent copy of the training frame; later reassignments of faz_train_2 do not touch faz_train.
faz_train_2 = faz_train.copy()

In [10]:
# Keep only rows whose label is none of 'politik', 'sport' or 'wirtschaft';
# faz_rem is another name for the same filtered frame.
faz_train_2 = faz_train_2[~faz_train_2['label'].isin(['politik', 'sport', 'wirtschaft'])]
faz_rem = faz_train_2

In [11]:
# Number of distinct categories left in the "rest" frame.
faz_rem.label.nunique()

9

In [12]:
# Non-null values per column for each remaining category.
faz_rem.groupby('label').count()

Unnamed: 0_level_0,link,published,title,detailed
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
aktuell,748,748,748,748
beruf-chance,21,21,21,21
feuilleton,113,113,113,113
finanzen,77,77,77,77
gesellschaft,144,144,144,144
reise,11,11,11,11
stil,27,27,27,27
technik-motor,33,33,33,33
wissen,50,50,50,50


---

# Build feature word list:

In [13]:
# Raw strings: "\w" inside a normal string literal is an invalid escape
# sequence that newer Python versions warn about. The compiled patterns are
# unchanged.
# Each "word" needs at least two characters, and findall returns
# non-overlapping matches, so the 2- and 3-word patterns yield consecutive
# groups rather than a sliding window.
rgx_singles = re.compile(r"([\w][\w']*[\w])")
rgx_doubles = re.compile(r"([\w][\w']*[\w] +[\w][\w']*[\w])")
rgx_triples = re.compile(r"([\w][\w']*[\w] +[\w][\w']*[\w] +[\w][\w']*[\w])")

# Translation table that deletes every ASCII punctuation character.
translator = str.maketrans('', '', string.punctuation)

In [14]:
words = []

# Strip punctuation from every 'detailed' text, then collect its 1-, 2- and
# 3-word matches (in that order) into one flat list.
for text in faz_train['detailed']:
    cleaned = text.translate(translator)
    for pattern in (rgx_singles, rgx_doubles, rgx_triples):
        words.extend(pattern.findall(cleaned))

In [15]:
# One row per collected entry of `words`, in a single 'word' column.
word_list = pd.DataFrame(words, columns=['word'])

In [16]:
titles = []

# Same extraction as for the 'detailed' texts, applied to the 'title' column.
for text in faz_train['title']:
    cleaned = text.translate(translator)
    for pattern in (rgx_singles, rgx_doubles, rgx_triples):
        titles.extend(pattern.findall(cleaned))

In [17]:
# One row per collected entry of `titles`, in a single 'word' column.
title_list = pd.DataFrame(titles, columns=['word'])

In [18]:
# Stack both token frames; the index is not reset, so index values repeat.
scraped = [word_list, title_list]
merged = pd.concat(scraped)
merged

Unnamed: 0,word
0,Randolph
1,Alles
2,auf
3,einer
4,Pressekonferenz
5,am
6,26
7,Oktober
8,2018In
9,der


## Remove all words that are in the stopword list:

In [19]:
# Plain Python list of all collected tokens.
merged_list = merged['word'].tolist()

In [20]:
# Directory that holds the stop-word lists. It can be overridden with the
# STOPWORDS_DIR environment variable so the notebook also runs on other
# machines; the default is the original absolute location.
stopwords_dir = os.environ.get('STOPWORDS_DIR', '/Users/torben/PycharmProjects/toolbox/stopwords')

stopword = pd.read_csv(f"{stopwords_dir}/stopwords.csv", index_col=None, header=0)
stopwordupper = pd.read_csv(f"{stopwords_dir}/stopwordsupper.csv", index_col=None, header=0)

In [21]:
# Both stop-word frames are read through their 'words' column.
stopwords = stopword['words'].tolist()
stopwordsupper = stopwordupper['words'].tolist()

In [22]:
# Set membership is a constant-time lookup; testing every token against a
# list scanned the whole stop-word list for each token.
stopword_set = set(stopwords)
stopwordupper_set = set(stopwordsupper)

# NOTE(review): only entries that equal a stop word exactly are removed, so
# multi-word entries made of stop words (e.g. 'in der') pass this filter.
feature_list = [word for word in merged_list if word not in stopword_set]
fin_feat_list = [word for word in feature_list if word not in stopwordupper_set]

In [23]:
# Single-column frame holding the filtered tokens.
features = pd.DataFrame(fin_feat_list)

In [24]:
# Name the token column.
features.columns = ['words']

In [25]:
# Constant helper column; gives a group-wise count something to count.
features['count'] = 1

In [26]:
# Occurrences of every distinct token; the tokens become the index.
features = features.groupby('words').count()

In [27]:
# Most frequent tokens first; keep at most the first 400 of them.
features = features.sort_values('count', ascending=False)
features_sort = features.head(400)

In [28]:
# Feature names in descending frequency order. This rebinds `feature_list`,
# which earlier held the intermediate result of the stop-word filtering.
feature_list = list(features_sort.index)
feature_list

['Jahre',
 'in der',
 'Liveticker',
 'Trump',
 'Zeit',
 'EU',
 'Nowitzki',
 'Trainer',
 'Assange',
 'Jahren',
 'Brexit',
 'deutschen',
 'Polizei',
 'May',
 'League',
 'für die',
 'Dirk',
 'Deutschland',
 'Berlin',
 'zeigt',
 'Donald',
 'Julian',
 'Menschen',
 'Welt',
 'deutsche',
 'offenbar',
 'Jahr',
 'Spiel',
 'AfD',
 'Bundesliga',
 'Präsident',
 'Bayern',
 'britische',
 'Theresa',
 'London',
 'FC',
 'Netanjahu',
 'Champions',
 'große',
 'Europa',
 'Merkel',
 'Israel',
 'Amerika',
 'an der',
 'Kinder',
 'München',
 'Eintracht',
 'Premierministerin',
 'Frankfurter',
 'spricht',
 'mit dem',
 'Verfolgen',
 'China',
 'im Liveticker',
 'stellt',
 'großen',
 'Frankfurt',
 'Deutsche',
 'Bild',
 'lässt',
 'lange',
 'Euro',
 'Wahl',
 'bei der',
 'Rennen',
 'März',
 'Dortmund',
 'Champions League',
 'Dirk Nowitzki',
 'vergangenen',
 'mit der',
 'Briten',
 'Regierung',
 'besten',
 'Formel',
 'Interview',
 'gewinnt',
 'alte',
 'Mittwoch',
 'auf dem',
 'April',
 'sorgt',
 'Angela',
 'Augsburg',
 

In [29]:
# Map every feature name to its position in feature_list.
dictionary = {feature: position for position, feature in enumerate(feature_list)}

---
---

# Iterate over 'politik' buzzwords:

In [30]:
pol_buzzword = []

# One count vector per 'politik' article. Its width follows the feature list
# instead of a literal 400: head(400) returns fewer rows for a small
# vocabulary, and a fixed width would then clash with `columns=feature_list`
# when the frame is built from these vectors.
for text in faz_pol['detailed']:
    new_line = np.zeros(len(feature_list))
    # NOTE(review): only whitespace-separated tokens are looked up, so
    # multi-word features (e.g. 'in der') and tokens with attached
    # punctuation never match here.
    for word in text.split(' '):
        if word in dictionary:
            new_line[dictionary[word]] += 1
    pol_buzzword.append(new_line)

In [31]:
# One row per count vector, one column per feature.
df_pol = pd.DataFrame(pol_buzzword, columns=feature_list)

In [32]:
# Target value 1 for all rows of this frame.
df_pol['goal_val'] = 1

# Iterate over 'sport' buzzwords:

In [33]:
sport_buzzword = []

# One count vector per 'sport' article; the width follows the feature list
# (a literal 400 breaks `columns=feature_list` for smaller vocabularies).
for text in faz_sport['detailed']:
    new_line = np.zeros(len(feature_list))
    # NOTE(review): whitespace tokens only - multi-word features and tokens
    # with attached punctuation never match here.
    for word in text.split(' '):
        if word in dictionary:
            new_line[dictionary[word]] += 1
    sport_buzzword.append(new_line)

In [34]:
# One row per count vector, one column per feature.
df_sport = pd.DataFrame(sport_buzzword, columns=feature_list)

In [35]:
# Target value 2 for all rows of this frame.
df_sport['goal_val'] = 2

# Iterate over 'wirtschaft' buzzwords:

In [36]:
eco_buzzword = []

# One count vector per 'wirtschaft' article; the width follows the feature
# list (a literal 400 breaks `columns=feature_list` for smaller vocabularies).
for text in faz_eco['detailed']:
    new_line = np.zeros(len(feature_list))
    # NOTE(review): whitespace tokens only - multi-word features and tokens
    # with attached punctuation never match here.
    for word in text.split(' '):
        if word in dictionary:
            new_line[dictionary[word]] += 1
    eco_buzzword.append(new_line)

In [37]:
# One row per count vector, one column per feature.
df_eco = pd.DataFrame(eco_buzzword, columns=feature_list)

In [38]:
# Target value 3 for all rows of this frame.
df_eco['goal_val'] = 3

# Iterate over remaining buzzwords:

In [39]:
rem_buzzword = []

# One count vector per remaining article; the width follows the feature list
# (a literal 400 breaks `columns=feature_list` for smaller vocabularies).
for text in faz_rem['detailed']:
    new_line = np.zeros(len(feature_list))
    # NOTE(review): whitespace tokens only - multi-word features and tokens
    # with attached punctuation never match here.
    for word in text.split(' '):
        if word in dictionary:
            new_line[dictionary[word]] += 1
    rem_buzzword.append(new_line)

In [40]:
# One row per count vector, one column per feature.
df_rem = pd.DataFrame(rem_buzzword, columns=feature_list)

In [41]:
# Target value 0 for all rows of this frame.
df_rem['goal_val'] = 0

In [42]:
# Stack the four class frames; the index is not reset, so index values repeat.
tables = [df_pol, df_eco, df_sport, df_rem]
train_data = pd.concat(tables)

---
**saving the train data frames to a .csv file:**

In [43]:
# Target location of the training frame (the name carries no file extension).
name = 'train_data'
path = f"../data_frames/{name}"

print("Pathname:", path)

Pathname: ../data_frames/train_data


In [44]:
# Write the frame as CSV without the index column.
train_data.to_csv(path, index=False)

# SAME FOR TEST DATA:

In [45]:
def build_test_tables(category, base_dir="../data"):
    """Load every CSV file of one test category into a single labelled frame.

    Parameters
    ----------
    category : str
        Name of the sub-directory below ``base_dir`` that holds the
        category's ``.csv`` files. The same value is written into the
        ``label`` column of the result.
    base_dir : str, optional
        Directory containing one sub-directory per category. Defaults to
        ``"../data"``, the location this function has always read from.

    Returns
    -------
    pandas.DataFrame
        The rows of all CSV files of the category with a fresh 0..n-1 index
        and an additional ``label`` column set to ``category``.

    Raises
    ------
    ValueError
        If the category directory contains no ``.csv`` file.
    """
    # The former `category is 'aktuell'` branch compared object identity with
    # a string literal and built the same path as its else-branch, so one
    # path expression covers every category.
    path = f"{base_dir}/{category}/"

    # glob returns matches in arbitrary order; sorting keeps the row order
    # reproducible between runs and machines.
    all_files = sorted(glob.glob(path + '*.csv'))
    if not all_files:
        # pd.concat([]) would fail with an unspecific
        # "No objects to concatenate"; name the offending directory instead.
        raise ValueError(f"No .csv files found in {path!r}")

    raw_article = [
        pd.read_csv(filename, index_col=None, header=0)
        for filename in all_files
    ]
    faz_article = pd.concat(raw_article, axis=0, ignore_index=True)
    faz_article['label'] = category

    return faz_article

In [46]:
fram = []

# Load one labelled frame per category. Only the category names (the dict
# values) are needed; the numeric keys are not used here.
for label in category.values():
    fram.append(build_test_tables(label))

# Concatenate once after the loop; doing it inside the loop rebuilt the
# steadily growing frame on every iteration for the same final result.
faz_test = pd.concat(fram, axis=0, ignore_index=True)

In [47]:
# Drop rows that are identical in every column, 'label' included.
faz_test = faz_test.drop_duplicates()

---

In [48]:
# One test frame for each of the three categories selected by name here.
faz_pol_t = faz_test.loc[faz_test['label'].eq('politik')]
faz_sport_t = faz_test.loc[faz_test['label'].eq('sport')]
faz_eco_t = faz_test.loc[faz_test['label'].eq('wirtschaft')]

In [49]:
# Independent copy of the test frame; later reassignments of faz_test_2 do not touch faz_test.
faz_test_2 = faz_test.copy()

In [50]:
# Keep only rows whose label is none of 'politik', 'sport' or 'wirtschaft';
# faz_rem_t is another name for the same filtered frame.
faz_test_2 = faz_test_2[~faz_test_2['label'].isin(['politik', 'sport', 'wirtschaft'])]
faz_rem_t = faz_test_2

# pol_t:

In [51]:
pol_buzzword_t = []

# One count vector per 'politik' test article; the width follows the feature
# list (a literal 400 breaks `columns=feature_list` for smaller vocabularies).
for text in faz_pol_t['detailed']:
    new_line = np.zeros(len(feature_list))
    # NOTE(review): whitespace tokens only - multi-word features and tokens
    # with attached punctuation never match here.
    for word in text.split(' '):
        if word in dictionary:
            new_line[dictionary[word]] += 1
    pol_buzzword_t.append(new_line)

In [52]:
# One row per count vector, one column per feature.
df_pol_t = pd.DataFrame(pol_buzzword_t, columns=feature_list)

In [53]:
# Target value 1 for all rows of this frame.
df_pol_t['goal_val'] = 1

# sport_t:

In [54]:
sport_buzzword_t = []

# One count vector per 'sport' test article; the width follows the feature
# list (a literal 400 breaks `columns=feature_list` for smaller vocabularies).
for text in faz_sport_t['detailed']:
    new_line = np.zeros(len(feature_list))
    # NOTE(review): whitespace tokens only - multi-word features and tokens
    # with attached punctuation never match here.
    for word in text.split(' '):
        if word in dictionary:
            new_line[dictionary[word]] += 1
    sport_buzzword_t.append(new_line)

In [55]:
# One row per count vector, one column per feature.
df_sport_t = pd.DataFrame(sport_buzzword_t, columns=feature_list)

In [56]:
# Target value 2 for all rows of this frame.
df_sport_t['goal_val'] = 2

# eco_t:

In [57]:
eco_buzzword_t = []

# One count vector per 'wirtschaft' test article; the width follows the
# feature list (a literal 400 breaks `columns=feature_list` for smaller
# vocabularies).
for text in faz_eco_t['detailed']:
    new_line = np.zeros(len(feature_list))
    # NOTE(review): whitespace tokens only - multi-word features and tokens
    # with attached punctuation never match here.
    for word in text.split(' '):
        if word in dictionary:
            new_line[dictionary[word]] += 1
    eco_buzzword_t.append(new_line)

In [58]:
# One row per count vector, one column per feature.
df_eco_t = pd.DataFrame(eco_buzzword_t, columns=feature_list)

In [59]:
# Target value 3 for all rows of this frame.
df_eco_t['goal_val'] = 3

# rem_t:

In [60]:
rem_buzzword_t = []

# One count vector per remaining test article; the width follows the feature
# list (a literal 400 breaks `columns=feature_list` for smaller vocabularies).
for text in faz_rem_t['detailed']:
    new_line = np.zeros(len(feature_list))
    # NOTE(review): whitespace tokens only - multi-word features and tokens
    # with attached punctuation never match here.
    for word in text.split(' '):
        if word in dictionary:
            new_line[dictionary[word]] += 1
    rem_buzzword_t.append(new_line)

In [61]:
# One row per count vector, one column per feature.
df_rem_t = pd.DataFrame(rem_buzzword_t, columns=feature_list)

In [62]:
# Target value 0 for all rows of this frame.
df_rem_t['goal_val'] = 0

In [63]:
# Stack the four class frames; the index is not reset, so index values repeat.
test_tables = [df_pol_t, df_sport_t, df_eco_t, df_rem_t]
top_test_data = pd.concat(test_tables)

---
**saving the data frames to a .csv file:**

In [64]:
# Target location of the test frame (the name carries no file extension).
name = 'test_data'
path = f"../data_frames/{name}"

print("Pathname:", path)

Pathname: ../data_frames/test_data


In [65]:
# Write the frame as CSV without the index column.
top_test_data.to_csv(path, index=False)

---