# Datacleaning
Check data, make it usable.

# Imports

In [1]:
import pandas as pd
import numpy as np

## Load dataset

In [2]:
from glob import glob
from sklearn.model_selection import train_test_split

In [3]:
# Empty list to append different categories
l = []
# Loop over all given csv in folder
for file in glob('raw_data/*.csv'):
    df = pd.read_csv(file)
    # Get category, add column to dataframe
    category = file.split('/')[-1].split("\\")[-1].split('.')[0]
    df['category'] = category
    l.append(df)

# Add all data frames together
df = pd.concat(l, axis=0, ignore_index=True)

#### Check if it worked

In [4]:
df.category.value_counts()

sports           156899
politics          87157
world             60297
entertainment     50282
travel            49470
financial         47851
technology        41476
Name: category, dtype: int64

## Check out Columns
Remove useless ones!

In [5]:
print(*df.columns.values)

organizations uuid thread author url ord_in_thread title locations entities highlightText language persons text external_links published crawled highlightTitle category


### organizations
Could be useful for eda, __check later__!

In [6]:
df.organizations.unique()
df.drop('organizations', axis=1, inplace=True)

### thread
Find useful information in thread!

In [7]:
import json
# turn str to dict
count=0
l = []
for s in df.thread.values:
    s = s.replace("\'", '\"') # replace single quotes to double quotes because json does not support single quotes
    try: # Try to load as json, some values are wrong
        l.append(json.loads(s))
    except:
        l.append({'false': False})
        count += 1
print('{} entries could not be loaded!'.format(count))
        
df.thread = l

97132 entries could not be loaded!


__Remove__ entries that could not be converted!

In [8]:
df = df[['false' not in i for i in  [i.keys() for i in df.thread.values]]]

In [9]:
df.thread.values[0].keys()

dict_keys(['social', 'site_full', 'main_image', 'site_section', 'section_title', 'url', 'country', 'title', 'performance_score', 'site', 'participants_count', 'title_full', 'spam_score', 'site_type', 'published', 'replies_count', 'uuid'])

In [10]:
sum([i['social']['pinterest']['shares'] for i in df.thread.values if 'social' in i.keys()])

0

Social seems to be __zero__ for __every value__!

In [11]:
df.thread.values[1]['site']

'newsdump.com'

In [12]:
df['site'] = [i['site'] for i in df.thread.values]

In [13]:
df.site.value_counts()

newsdump.com              71998
wn.com                    20460
yahoo.com                 15868
cbs8.com                  10907
reuters.com                8372
                          ...  
ipsnews.net                   1
spacewar.com                  1
capsta.co.uk                  1
scientificamerican.com        1
hindawi.com                   1
Name: site, Length: 1868, dtype: int64

Website could be __useful__ for __eda__!

In [14]:
df.thread.values[1]['country']

'CZ'

In [15]:
df['country'] = [i['country'] for i in df.thread.values]

In [16]:
df.country.value_counts()

US    239304
CZ     72003
GB     34176
AU     11576
IL      9751
        7817
CA      4841
IE      4495
IN      2702
SG      2129
EU      2016
CH       901
JE       887
EG       706
MY       676
IT       471
ZA       417
TH       327
AE       267
ID       211
NZ       199
FR       130
HK        99
JP        43
CR        39
PA        37
DE        24
GR        16
TR        15
VG        10
VN         9
RS         4
RU         1
BE         1
Name: country, dtype: int64

__Country__ is useful for eda, __include__ as new __column__!

In [17]:
set([i['site_type'] for i in df.thread.values])

{'news'}

Site type only has one value, so no need to use it!

__Remove__ feature __thread__ afterwards!

In [18]:
df.drop('thread', axis=1, inplace=True)

### title
Looks like it can be directly used like this!

In [19]:
df.title.values[0]

'The Healthiest Pastas: From Quinoa to Buckwheat Noodles'

In [20]:
for i in sorted(set(str(df.title.values))):
    print(repr(i), end=' ')

'\n' ' ' "'" ',' '-' '.' '0' '1' '2' '4' ':' 'A' 'B' 'C' 'D' 'F' 'H' 'M' 'N' 'O' 'P' 'Q' 'R' 'S' 'T' 'V' 'W' 'Y' '[' ']' 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'k' 'l' 'm' 'n' 'o' 'p' 'r' 's' 't' 'u' 'v' 'w' 'y' '£' 

### locations
Could be __useful__ for later __eda__!

In [21]:
df.locations.values[1]
df.drop('locations', axis=1, inplace=True)

### text

In [22]:
# Remove tags at end of article
df.text = [i.split('\nTAGS')[0] for i in df.text.values]
# Remove Copyright at end of article
df.text = [i.split('\nCopyright')[0] for i in df.text.values]
# Remove Copyright as sign
df.text = [i.split('\n©')[0] for i in df.text.values]
# Remove texts consisting only of *** *** ***
df = df[[False if '*** ***' in text else True for text in df.text]]

In [23]:
# Remove texts with strange multiple '...' because only 10000 and we are too  lazy to do more analysis
df['num_triple_dots'] = [len(i.split('...')) for i in df.text.values]
df = df.query('num_triple_dots < 4')
df.drop('num_triple_dots', axis=1, inplace=True)

In [24]:
# Include feature text length
df['text_length'] = [len(i) for i in df.text.values]

In [25]:
# Remove texts that are too long or too small
df = df.query('text_length > 800 & text_length < 5000')
df.shape[0]

187982

In [26]:
for i in sorted(set(str(df.text.values))):
    print(repr(i), end=' ')

'\n' ' ' '"' '$' '%' '&' "'" '(' ')' ',' '-' '.' '/' '0' '1' '2' '3' '4' '5' '6' '7' '8' '9' ':' ';' '?' 'A' 'B' 'C' 'D' 'E' 'F' 'G' 'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O' 'P' 'R' 'S' 'T' 'U' 'V' 'W' 'Y' 'Z' '[' '\\' ']' 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'o' 'p' 'q' 'r' 's' 't' 'u' 'v' 'w' 'x' 'y' 'z' '–' '—' '’' '“' '”' 

### published
__Useful__ for __eda__ later!

In [27]:
df.published.values[0]

'2015-10-02T03:00:00.000+03:00'

## Unnecessary columns
__Remove__ uuid, author, url, ord_in_thread, entities, highlightText, highlightTitle, language, persons, external_links, crawled.\
These columns have no use for our purpose!

In [28]:
to_remove = ['uuid', 'author', 'url', 'ord_in_thread', 'entities', 'highlightText', 'highlightTitle', 'language', 'persons', 'external_links', 'crawled']
df.drop(to_remove, axis=1, inplace=True)

## Look at dataset

In [29]:
df.category.value_counts()

sports           50592
financial        30173
world            27425
travel           26266
technology       20972
entertainment    20028
politics         12526
Name: category, dtype: int64

In [30]:
print('#### News data set ####')
print('# of observations: {}'.format(df.shape[0]))
print('# of features:          {}'.format(df.shape[1]))

#### News data set ####
# of observations: 187982
# of features:          7


In [31]:
df.isna().sum()

title          4
text           0
published      0
category       0
site           0
country        0
text_length    0
dtype: int64

__Remove__ nans, only 5!

In [32]:
df.dropna(inplace=True)

## Create Test set
With xxx values per category.

In [33]:
xxx = 500
# Split dataset, each category with same sample size
TEST = []
TRAIN = []
for i in df.category.unique():
    tmp = df[df['category'] == i]
    train, test = train_test_split(tmp, shuffle=True, test_size=(xxx/(tmp.shape[0]+0.001)), random_state=42)
    TEST.append(test)
    TRAIN.append(train)
df_test = pd.concat(TEST, axis=0, ignore_index=True)
df = pd.concat(TRAIN, axis=0, ignore_index=True)

In [34]:
df_test.category.value_counts()

politics         500
technology       500
entertainment    500
travel           500
financial        500
sports           500
world            500
Name: category, dtype: int64

## Create train dataset
Use about 20000 per category.

In [35]:
#xxx = df.category.value_counts(ascending=True)[0] # number of samples per category, take all if None
xxx = 20000

In [36]:
if xxx != None:
    # Split dataset, each category with same sample size
    l = []
    for i in df.category.unique():
        tmp = df[df['category'] == i]
        test_size = (1-xxx/(tmp.shape[0]+0.001))
        if test_size > 0:
            tmp, _ = train_test_split(tmp, shuffle=True, test_size=test_size, random_state=42)
        l.append(tmp)
    df = pd.concat(l, axis=0, ignore_index=True)

In [37]:
df.category.value_counts()

world            19999
sports           19999
technology       19999
financial        19999
travel           19999
entertainment    19528
politics         12026
Name: category, dtype: int64

### Add Lemmatization

In [38]:
from time import time
# Add functions path
import sys
sys.path.append('../../Functions')
from text_lemmatization import Lemmatizer
lemmatizer = Lemmatizer()
start = time()
df['text_lem'] = lemmatizer.lem_list(df.text)
df_test['text_lem'] = lemmatizer.lem_list(df_test.text)
print('finished in {:.2f}s'.format(time()-start))
# Took about 55 min

finished in 3986.39s


In [39]:
# Save dataframes
df.to_csv('dataset_categories_train.csv', index=False)
df_test.to_csv('dataset_categories_test.csv', index=False)