# Load files from data

**Load all the articles from local storage (new_data & data) and prepare the tables for the subsequent analysis. Alternatively - if implemented - use boto3 to load data from S3 storage on AWS:**

---

### Loading packages

In [1]:
import pandas as pd
import numpy as np
import glob
import re
import string
#import boto3

## Loading data

The function below gathers all the .csv files within one category's data directory, concatenates and labels them, and then returns a data frame.

In [2]:
# Numbered news sections. The names are passed to the table builders, where
# they select the sub-directory to read and become the value of the 'label' column.
category = {
    1: 'politik',
    2: 'wirtschaft',
    3: 'finanzen',
    4: 'feuilleton',
    5: 'sport',
    6: 'gesellschaft',
    7: 'stil',
    8: 'technik-motor',
    9: 'wissen',
    10: 'reise',
    11: 'beruf-chance',
    12: 'aktuell',
}

In [3]:
def build_tables(category, base_dir="../new_data"):
    """Load every CSV file of one category into a single labelled data frame.

    All ``*.csv`` files found directly inside ``<base_dir>/<category>/`` are
    read with pandas, concatenated row-wise with a fresh 0..n-1 index, and
    tagged with a ``label`` column holding the category name.

    Parameters
    ----------
    category : str
        Name of the category sub-directory; also used as the label value.
    base_dir : str, optional
        Directory containing one sub-directory per category.
        Defaults to ``"../new_data"``.

    Returns
    -------
    pandas.DataFrame
        The concatenated rows of all files plus the ``label`` column.

    Raises
    ------
    ValueError
        If the category directory contains no ``*.csv`` file.
    """
    # Every category, including 'aktuell', is read from its own sub-directory,
    # so a single pattern covers all of them.
    pattern = f"{base_dir}/{category}/*.csv"

    # glob yields files in arbitrary order; sort for a reproducible row order.
    all_files = sorted(glob.glob(pattern))
    if not all_files:
        # pd.concat would also raise ValueError on an empty list, but without
        # saying which directory was empty.
        raise ValueError(f"No CSV files found matching {pattern!r}")

    raw_articles = [
        pd.read_csv(filename, index_col=None, header=0) for filename in all_files
    ]
    faz_articles = pd.concat(raw_articles, axis=0, ignore_index=True)
    faz_articles['label'] = category

    return faz_articles

---
# Generate Test and Train Data

**This for-loop goes through the data frames built in the first step and then collects the words from each entry in a list.**

## Train Data:

In [4]:
# Load every category once, then concatenate all frames in a single call;
# concatenating inside the loop would rebuild the growing frame on each pass.
frames = [build_tables(label) for label in category.values()]
faz_train = pd.concat(frames, axis=0, ignore_index=True)

In [5]:
# Drop rows that are identical in every column (pandas keeps the first occurrence by default).
faz_train = faz_train.drop_duplicates()

In [6]:
# Non-null count of every column, per label.
faz_train.groupby('label').count()

Unnamed: 0_level_0,link,published,title,detailed
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
aktuell,441,441,441,441
beruf-chance,14,14,14,14
feuilleton,69,69,69,69
finanzen,45,45,45,45
gesellschaft,79,79,79,79
politik,170,170,170,170
reise,9,9,9,9
sport,146,146,146,146
stil,18,18,18,18
technik-motor,24,24,24,24


# Split train data in 'sport' and 'rest':

This step is for the sake of labelling the subsequent data and to get an idea of the data distribution (**'Politics' make up about 17% of the train data**):

In [7]:
# All training articles whose label is anything other than 'sport';
# the last line displays the non-null count per column.
faz_rest = faz_train.loc[faz_train['label'].ne('sport')]
faz_rest.count()

link         990
published    990
title        990
detailed     990
label        990
dtype: int64

In [8]:
# All training articles labelled 'sport';
# the last line displays the non-null count per column.
faz_sport = faz_train.loc[faz_train['label'].eq('sport')]
faz_sport.count()

link         146
published    146
title        146
detailed     146
label        146
dtype: int64

In [9]:
# NOTE(review): hard-coded numbers. 170 equals the 'politik' row count in the
# per-label table above; 966 matches none of the totals shown in this notebook
# (sport: 146, rest: 990) — confirm what share this is meant to compute.
170/966

0.17598343685300208

---

## Build feature word list:

In [10]:
# Patterns for sequences of one, two and three words. A "word" is a word
# character, then any run of word characters or apostrophes, then a closing
# word character, so it is at least two characters long. Words in a sequence
# are separated by one or more spaces.
# Raw strings keep `\w` as a regex escape instead of an invalid string escape.
rgx_singles = re.compile(r"([\w][\w']*[\w])")
rgx_doubles = re.compile(r"([\w][\w']*[\w] +[\w][\w']*[\w])")
rgx_triples = re.compile(r"([\w][\w']*[\w] +[\w][\w']*[\w] +[\w][\w']*[\w])")

# Translation table that deletes every character listed in string.punctuation.
translator = str.maketrans('', '', string.punctuation)

In [11]:
# Collect every one-, two- and three-word sequence found in the article
# bodies, after removing punctuation from each text.
words = []

for text in faz_train['detailed']:
    cleaned = text.translate(translator)
    for pattern in (rgx_singles, rgx_doubles, rgx_triples):
        words.extend(pattern.findall(cleaned))

In [12]:
# One row per collected word sequence, in a single 'word' column.
word_list = pd.DataFrame(words, columns=['word'])

In [13]:
# Collect every one-, two- and three-word sequence found in the article
# titles, after removing punctuation from each title.
titles = []

for text in faz_train['title']:
    cleaned = text.translate(translator)
    for pattern in (rgx_singles, rgx_doubles, rgx_triples):
        titles.extend(pattern.findall(cleaned))

In [14]:
# One row per word sequence collected from the titles, in a single 'word' column.
title_list = pd.DataFrame(titles, columns=['word'])

In [15]:
# Stack the body and title word tables into one table.
scraped = [word_list, title_list]
merged = pd.concat(scraped)

In [16]:
# Helper column that is never null, so a per-word .count() over it gives the
# number of rows (occurrences) for each distinct word.
merged['count'] = 1

In [17]:
# Number of occurrences per distinct word sequence (index: word, column: count).
features = merged.groupby('word').count()

In [18]:
# Rank the word sequences by frequency, most frequent first, and keep the
# top 400 of them as the feature vocabulary (a plain Python list).
features = features.sort_values(by='count', ascending=False)
feature_list = features.index[:400].tolist()

In [19]:
# Display the selected feature vocabulary, most frequent entry first.
feature_list

['der',
 'in',
 'die',
 'und',
 'im',
 'den',
 'für',
 'von',
 'auf',
 'das',
 'mit',
 'Die',
 'ist',
 'dem',
 'sich',
 'eine',
 'ein',
 'nicht',
 'des',
 'aus',
 'nach',
 'an',
 'gegen',
 'zu',
 'hat',
 'als',
 'es',
 'Der',
 'vor',
 'am',
 'bei',
 'Das',
 'einem',
 'einer',
 'noch',
 'einen',
 'will',
 'über',
 'um',
 'auch',
 'er',
 'sie',
 'wird',
 'Ein',
 'sind',
 'mehr',
 'zum',
 'werden',
 'haben',
 'wie',
 'Doch',
 'nur',
 'zur',
 'muss',
 'in der',
 'war',
 'beim',
 'Sie',
 'seine',
 'sein',
 'gibt',
 'In',
 'aber',
 'soll',
 'seiner',
 'nun',
 'dass',
 'schon',
 'Nowitzki',
 'Bayern',
 'Trainer',
 'Jahre',
 'was',
 'wenn',
 'Trump',
 'ihre',
 'Berlin',
 'ab',
 'kann',
 'geht',
 'so',
 'Eine',
 'wieder',
 'unter',
 'Wie',
 'Brexit',
 'neuen',
 'immer',
 'gut',
 'viel',
 'keine',
 'vom',
 'Jahren',
 'Dirk',
 'ins',
 'Liveticker',
 'Netanjahu',
 'FC',
 'eines',
 'Bundesliga',
 'Es',
 'neue',
 'Nun',
 'deutschen',
 'Im',
 'Präsident',
 'Deutschland',
 'Weg',
 'wegen',
 'ihren',
 

In [20]:
# Lookup from feature word to its column position in the count vectors.
dictionary = {word: position for position, word in enumerate(feature_list)}

# Iterate over 'sport' buzzwords:

In [21]:
# One count vector per 'sport' training article: position j holds how often
# feature_list[j] appears among the article's space-separated tokens.
# The vector width follows feature_list so it always matches the column count
# of the data frame built from these rows, even with fewer than 400 features.
# NOTE(review): tokens come from the raw text split on ' ', while the feature
# vocabulary was built from punctuation-stripped text and contains multi-word
# entries; those entries can never match a single token here — confirm intent.
sport_buzzword = []

for text in faz_sport['detailed']:
    counts = np.zeros(len(feature_list))
    for token in text.split(' '):
        if token in dictionary:
            counts[dictionary[token]] += 1
    sport_buzzword.append(counts)

In [22]:
# One row per 'sport' training article, one column per feature word.
df1 = pd.DataFrame(sport_buzzword, columns=feature_list)

In [23]:
# Target column: 1 marks the rows built from the 'sport' articles.
df1['goal_val'] = 1

# Iterate over remaining buzzwords:

In [24]:
# One count vector per non-'sport' training article: position j holds how
# often feature_list[j] appears among the article's space-separated tokens.
# The vector width follows feature_list so it always matches the column count
# of the data frame built from these rows, even with fewer than 400 features.
# NOTE(review): tokens come from the raw text split on ' ', while the feature
# vocabulary was built from punctuation-stripped text and contains multi-word
# entries; those entries can never match a single token here — confirm intent.
buzzwords = []

for text in faz_rest['detailed']:
    counts = np.zeros(len(feature_list))
    for token in text.split(' '):
        if token in dictionary:
            counts[dictionary[token]] += 1
    buzzwords.append(counts)

In [25]:
# One row per non-'sport' training article, one column per feature word.
df2 = pd.DataFrame(buzzwords, columns=feature_list)

In [26]:
# Target column: 0 marks the rows built from the non-'sport' articles.
df2['goal_val'] = 0

In [27]:
# Stack the 'sport' rows on top of the remaining rows. The original row
# indices of both frames are kept, so index values repeat in the result.
train_tables = [df1, df2]
top_train_data = pd.concat(train_tables)

---
**saving the train data frames to a .csv file:**

In [28]:
# Relative destination of the training table.
# NOTE(review): the file name carries no '.csv' extension although the table
# is written as CSV — confirm that readers of this file expect this name.
name = 'train_data'

path = f"../data_frames/sport/{name}"
print("Pathname:", path)

Pathname: ../data_frames/sport/train_data


In [29]:
# Write the training table as CSV, leaving out the index column.
top_train_data.to_csv(path, index=False)

# SAME FOR TEST DATA:

In [30]:
def build_test_tables(category, base_dir="../data"):
    """Load every test CSV file of one category into a single labelled data frame.

    All ``*.csv`` files found directly inside ``<base_dir>/<category>/`` are
    read with pandas, concatenated row-wise with a fresh 0..n-1 index, and
    tagged with a ``label`` column holding the category name.

    Parameters
    ----------
    category : str
        Name of the category sub-directory; also used as the label value.
    base_dir : str, optional
        Directory containing one sub-directory per category.
        Defaults to ``"../data"``.

    Returns
    -------
    pandas.DataFrame
        The concatenated rows of all files plus the ``label`` column.

    Raises
    ------
    ValueError
        If the category directory contains no ``*.csv`` file.
    """
    # Every category, including 'aktuell', is read from its own sub-directory,
    # so a single pattern covers all of them.
    pattern = f"{base_dir}/{category}/*.csv"

    # glob yields files in arbitrary order; sort for a reproducible row order.
    all_files = sorted(glob.glob(pattern))
    if not all_files:
        # pd.concat would also raise ValueError on an empty list, but without
        # saying which directory was empty.
        raise ValueError(f"No CSV files found matching {pattern!r}")

    raw_article = [
        pd.read_csv(filename, index_col=None, header=0) for filename in all_files
    ]
    faz_article = pd.concat(raw_article, axis=0, ignore_index=True)
    faz_article['label'] = category

    return faz_article

In [31]:
# Load every category's test files once, then concatenate all frames in a
# single call; concatenating inside the loop would rebuild the growing frame
# on each pass.
fram = [build_test_tables(label) for label in category.values()]
faz_test = pd.concat(fram, axis=0, ignore_index=True)

In [32]:
# Drop rows that are identical in every column (pandas keeps the first occurrence by default).
faz_test = faz_test.drop_duplicates()

In [33]:
# Display the test articles for a visual check.
faz_test

Unnamed: 0,link,published,title,detailed,label
0,https://www.faz.net/aktuell/politik/trumps-pra...,"Tue, 02 Apr 2019 14:16:22 +0200",Joe Biden: Vom Hoffnungsträger zum Problem?,Für viele Amerikaner ein vertrauter Politiker:...,politik
1,https://www.faz.net/aktuell/brexit/nach-brexit...,"Tue, 02 Apr 2019 07:46:28 +0200",Brexit: Das Ende naht – tut es das?,Eine Brexit-Anhängerin vor dem Parlament in Lo...,politik
2,https://www.faz.net/aktuell/politik/ausland/eu...,"Tue, 02 Apr 2019 12:30:17 +0200",EU: Risiko eines Chaos-Brexits wächst täglich,Michel BarnierWieder hat das britische Unterha...,politik
3,https://www.faz.net/aktuell/politik/ausland/ve...,"Tue, 02 Apr 2019 03:57:03 +0200",Venezuelas Oppositionschef Guaidó soll die Imm...,Juan Guaido (hier am Montag in der Universität...,politik
4,https://www.faz.net/aktuell/politik/alle-alter...,"Tue, 02 Apr 2019 00:10:13 +0200",Alle Alternativen zu Mays Brexit-Vertrag schei...,Am britischen Parlament scheint eine Projektio...,politik
5,https://www.faz.net/aktuell/politik/ausland/na...,"Mon, 01 Apr 2019 21:11:30 +0200",Nach der Wahl in der Türkei: Risse im System E...,Flagge zeigen: Die Kommunalwahl ist ein wichti...,politik
6,https://www.faz.net/aktuell/politik/ausland/sc...,"Mon, 01 Apr 2019 16:38:02 +0200",Interview mit Schwedens Ministerpräsident,Der schwedische Ministerpräsident Stefan Löfve...,politik
7,https://www.faz.net/aktuell/politik/ausland/sp...,"Mon, 01 Apr 2019 19:51:45 +0200",„Angriff auf die Republik“: Sprengsätze kurz v...,Donnerstag reist er nach Korsika: Emmanuel Mac...,politik
8,https://www.faz.net/aktuell/politik/inland/meh...,"Mon, 01 Apr 2019 18:30:00 +0200",Mehrheit der Deutschen lehnt gendergerechte Sp...,Wort mit Sternchen: Wird gendergerechte Sprach...,politik
9,https://www.faz.net/aktuell/politik/inland/wah...,"Wed, 03 Apr 2019 15:36:27 +0200",Wahlreform: Verkleinerung des Bundestags vorer...,Besucher in der Kuppel des Reichstagsgebäudes ...,politik


---

In [34]:
# All test articles labelled 'sport';
# the last line displays the non-null count per column.
test_sport = faz_test.loc[faz_test['label'].eq('sport')]
test_sport.count()

link         36
published    36
title        36
detailed     36
label        36
dtype: int64

In [35]:
# All test articles whose label is anything other than 'sport';
# the last line displays the non-null count per column.
test_rest = faz_test.loc[faz_test['label'].ne('sport')]
test_rest.count()

link         433
published    433
title        433
detailed     433
label        433
dtype: int64

In [36]:
# NOTE(review): hard-coded numbers that match neither count shown above
# (test sport: 36, test rest: 433) — confirm what share this is meant to compute.
66/403

0.16377171215880892

In [37]:
# One count vector per 'sport' test article: position j holds how often
# feature_list[j] appears among the article's space-separated tokens.
# The vector width follows feature_list so it always matches the column count
# of the data frame built from these rows, even with fewer than 400 features.
# NOTE(review): tokens come from the raw text split on ' ', while the feature
# vocabulary was built from punctuation-stripped text and contains multi-word
# entries; those entries can never match a single token here — confirm intent.
test_sport_buzzword = []

for text in test_sport['detailed']:
    counts = np.zeros(len(feature_list))
    for token in text.split(' '):
        if token in dictionary:
            counts[dictionary[token]] += 1
    test_sport_buzzword.append(counts)

In [38]:
# One row per 'sport' test article, one column per feature word.
df3 = pd.DataFrame(test_sport_buzzword, columns=feature_list)

In [39]:
# Target column: 1 marks the rows built from the 'sport' test articles.
df3['goal_val'] = 1

In [40]:
# One count vector per non-'sport' test article: position j holds how often
# feature_list[j] appears among the article's space-separated tokens.
# The vector width follows feature_list so it always matches the column count
# of the data frame built from these rows, even with fewer than 400 features.
# NOTE(review): tokens come from the raw text split on ' ', while the feature
# vocabulary was built from punctuation-stripped text and contains multi-word
# entries; those entries can never match a single token here — confirm intent.
test_rest_buzzword = []

for text in test_rest['detailed']:
    counts = np.zeros(len(feature_list))
    for token in text.split(' '):
        if token in dictionary:
            counts[dictionary[token]] += 1
    test_rest_buzzword.append(counts)

In [41]:
# One row per non-'sport' test article, one column per feature word.
df4 = pd.DataFrame(test_rest_buzzword, columns=feature_list)

In [42]:
# Target column: 0 marks the rows built from the non-'sport' test articles.
df4['goal_val'] = 0

In [43]:
# Stack the 'sport' rows on top of the remaining rows. The original row
# indices of both frames are kept, so index values repeat in the result.
test_tables = [df3, df4]
top_test_data = pd.concat(test_tables)

---
**saving the data frames to a .csv file:**

In [44]:
# Relative destination of the test table.
# NOTE(review): the file name carries no '.csv' extension although the table
# is written as CSV — confirm that readers of this file expect this name.
name = 'test_data'

path = f"../data_frames/sport/{name}"
print("Pathname:", path)

Pathname: ../data_frames/sport/test_data


In [45]:
# Write the test table as CSV, leaving out the index column.
top_test_data.to_csv(path, index=False)

---