## Feature Engineering

In [3]:
# importing libraries

import pickle
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2
import numpy as np

##### First of all we'll load the dataset:

In [4]:
path_df = r"Y:\Masters_Content\Feature_Engineering\Lab1\Codes\News_dataset.pickle"

with open(path_df, 'rb') as data:
    df = pickle.load(data)

In [5]:
df

Unnamed: 0,File_Name,Content,Category,id,News_length
0,The Tempe Police Department said it was invest...,We continue to assist investigators in any way...,autonomous car,1,799.0
1,"On Sunday, the inevitable happened: An autonom...",Cars don’t see wellAutonomous cars don’t track...,autonomous car,1,743.0
2,Even before a driverless Uber vehicle struck a...,"The accident in Tempe, Arizona, was believed t...",autonomous car,1,587.0
3,"On Sunday night, a woman died after she was hi...","On Sunday night, a woman died after she was hi...",autonomous car,1,497.0
4,A self-driving vehicle made by Uber has struck...,Something unexpectedly entering the vehicle’s ...,autonomous car,1,728.0
...,...,...,...,...,...
2094,507.txt,Big guns ease through in San Jose\r\n\r\nTop-s...,sport,1,1376.0
2095,508.txt,Almagro continues Spanish surge\r\n\r\nUnseede...,sport,1,779.0
2096,509.txt,Melzer shocks Agassi in San Jose\r\n\r\nSecond...,sport,1,1154.0
2097,510.txt,Mirza makes Indian tennis history\r\n\r\nTeena...,sport,1,1779.0


##### And visualize one sample news content:

In [6]:
df.loc[1]['Content']



## 1. Text cleaning and preparation

### 1.1. Special character cleaning

##### We can see the following special characters:

##### \r
##### \n
##### \ before possessive pronouns (government's = government\'s)
##### \ before possessive pronouns 2 (Yukos' = Yukos\')
##### " when quoting text

In [7]:
# \r and \n
df['Content_Parsed_1'] = df['Content'].str.replace("\r", " ")
df['Content_Parsed_1'] = df['Content_Parsed_1'].str.replace("\n", " ")
df['Content_Parsed_1'] = df['Content_Parsed_1'].str.replace("    ", " ")

In [8]:
text = "Mr Greenspan\'s"
text

"Mr Greenspan's"

In [9]:
# " when quoting text
df['Content_Parsed_1'] = df['Content_Parsed_1'].str.replace('"', '')

### 1.2. Upcase/downcase

##### We'll downcase the texts because we want, for example, Football and football to be the same word.

In [10]:
# Lowercasing the text
df['Content_Parsed_2'] = df['Content_Parsed_1'].str.lower()

### 1.3. Punctuation signs

##### Punctuation signs won't have any predicting power, so we'll just get rid of them.

In [11]:
punctuation_signs = list("?:!.,;")
df['Content_Parsed_3'] = df['Content_Parsed_2']

for punct_sign in punctuation_signs:
    df['Content_Parsed_3'] = df['Content_Parsed_3'].str.replace(punct_sign, '')

### 1.4. Possessive pronouns

##### We'll also remove possessive pronoun terminations:

In [12]:
df['Content_Parsed_4'] = df['Content_Parsed_3'].str.replace("'s", "")

### 1.5. Stemming and Lemmatization

##### Since stemming can produce output words that don't exist, we'll only use a lemmatization process at this moment. Lemmatization takes into consideration the morphological analysis of the words and returns words that do exist, so it will be more useful for us.

In [13]:
# Downloading punkt and wordnet from NLTK
nltk.download('punkt')
print("------------------------------------------------------------")
nltk.download('wordnet')

------------------------------------------------------------


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yaswa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yaswa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [14]:
# Saving the lemmatizer into an object
wordnet_lemmatizer = WordNetLemmatizer()

In [22]:
df = df.dropna().reset_index(drop = True)

##### In order to lemmatize, we have to iterate through every word:

In [23]:
nrows = len(df)
lemmatized_text_list = []

for row in range(0, nrows):
    
    # Create an empty list containing lemmatized words
    lemmatized_list = []
    
    # Save the text and its words into an object
    text = df.loc[row]['Content_Parsed_4']
    text_words = text.split(" ")

    # Iterate through every word to lemmatize
    for word in text_words:
        lemmatized_list.append(wordnet_lemmatizer.lemmatize(word, pos="v"))
        
    # Join the list
    lemmatized_text = " ".join(lemmatized_list)
    
    # Append to the list containing the texts
    lemmatized_text_list.append(lemmatized_text)


In [24]:
df['Content_Parsed_5'] = lemmatized_text_list

##### Although lemmatization doesn't work perfectly in all cases (as can be seen in the example below), it can be useful.

### 1.6. Stop words

In [25]:
# Downloading the stop words list
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yaswa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [26]:
# Loading the stop words in english
stop_words = list(stopwords.words('english'))

In [27]:
stop_words[0:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

##### To remove the stop words, we'll handle a regular expression only detecting whole words, as seen in the following example:

In [28]:
example = "me eating a meal"
word = "me"

# The regular expression is:
regex = r"\b" + word + r"\b"  # we need to build it like that to work properly

re.sub(regex, "StopWord", example)

'StopWord eating a meal'

##### We can now loop through all the stop words:

In [29]:
df['Content_Parsed_6'] = df['Content_Parsed_5']

for stop_word in stop_words:

    regex_stopword = r"\b" + stop_word + r"\b"
    df['Content_Parsed_6'] = df['Content_Parsed_6'].str.replace(regex_stopword, '')

#####  We have some dobule/triple spaces between words because of the replacements. However, it's not a problem because we'll tokenize by the spaces later.

##### As an example, we'll show an original news article and its modifications throughout the process:

In [30]:
df.loc[5]['Content']

'An autonomous Uber car killed a woman in the street in Arizona, police said, in what appears to be the first reported fatal crash involving a self-driving vehicle and a pedestrian in the US.\nThe company said it was pausing its self-driving car operations in Phoenix, Pittsburgh, San Francisco and Toronto.\nPhotograph: APThe self-driving technology is supposed to detect pedestrians, cyclists and others and prevent crashes.\nSimpson said he was unaware of any previous fatal crashes involving an autonomous vehicle and a pedestrian.\nEarlier this year, California regulators approved the testing of self-driving cars on public roads without human drivers monitoring inside.'

##### 1.Special character cleaning

In [31]:
df.loc[5]['Content_Parsed_1']

'An autonomous Uber car killed a woman in the street in Arizona, police said, in what appears to be the first reported fatal crash involving a self-driving vehicle and a pedestrian in the US. The company said it was pausing its self-driving car operations in Phoenix, Pittsburgh, San Francisco and Toronto. Photograph: APThe self-driving technology is supposed to detect pedestrians, cyclists and others and prevent crashes. Simpson said he was unaware of any previous fatal crashes involving an autonomous vehicle and a pedestrian. Earlier this year, California regulators approved the testing of self-driving cars on public roads without human drivers monitoring inside.'

##### 1.Upcase/downcase

In [32]:
df.loc[5]['Content_Parsed_2']

'an autonomous uber car killed a woman in the street in arizona, police said, in what appears to be the first reported fatal crash involving a self-driving vehicle and a pedestrian in the us. the company said it was pausing its self-driving car operations in phoenix, pittsburgh, san francisco and toronto. photograph: apthe self-driving technology is supposed to detect pedestrians, cyclists and others and prevent crashes. simpson said he was unaware of any previous fatal crashes involving an autonomous vehicle and a pedestrian. earlier this year, california regulators approved the testing of self-driving cars on public roads without human drivers monitoring inside.'

##### 1.Punctuation signs

In [33]:
df.loc[5]['Content_Parsed_3']

'an autonomous uber car killed a woman in the street in arizona police said in what appears to be the first reported fatal crash involving a self-driving vehicle and a pedestrian in the us the company said it was pausing its self-driving car operations in phoenix pittsburgh san francisco and toronto photograph apthe self-driving technology is supposed to detect pedestrians cyclists and others and prevent crashes simpson said he was unaware of any previous fatal crashes involving an autonomous vehicle and a pedestrian earlier this year california regulators approved the testing of self-driving cars on public roads without human drivers monitoring inside'

##### 1.Possessive pronouns

In [34]:
df.loc[5]['Content_Parsed_4']

'an autonomous uber car killed a woman in the street in arizona police said in what appears to be the first reported fatal crash involving a self-driving vehicle and a pedestrian in the us the company said it was pausing its self-driving car operations in phoenix pittsburgh san francisco and toronto photograph apthe self-driving technology is supposed to detect pedestrians cyclists and others and prevent crashes simpson said he was unaware of any previous fatal crashes involving an autonomous vehicle and a pedestrian earlier this year california regulators approved the testing of self-driving cars on public roads without human drivers monitoring inside'

##### 1. Stemming and Lemmatization

In [35]:
df.loc[5]['Content_Parsed_5']

'an autonomous uber car kill a woman in the street in arizona police say in what appear to be the first report fatal crash involve a self-driving vehicle and a pedestrian in the us the company say it be pause its self-driving car operations in phoenix pittsburgh san francisco and toronto photograph apthe self-driving technology be suppose to detect pedestrians cyclists and others and prevent crash simpson say he be unaware of any previous fatal crash involve an autonomous vehicle and a pedestrian earlier this year california regulators approve the test of self-driving cars on public roads without human drivers monitor inside'

##### 1.Stop words

In [36]:
df.loc[5]['Content_Parsed_6']

' autonomous uber car kill  woman   street  arizona police say   appear    first report fatal crash involve  self-driving vehicle   pedestrian   us  company say   pause  self-driving car operations  phoenix pittsburgh san francisco  toronto photograph apthe self-driving technology  suppose  detect pedestrians cyclists  others  prevent crash simpson say   unaware   previous fatal crash involve  autonomous vehicle   pedestrian earlier  year california regulators approve  test  self-driving cars  public roads without human drivers monitor inside'

##### Finally, we can delete the intermediate columns:

In [37]:
df.head(1)

Unnamed: 0,File_Name,Content,Category,id,News_length,Content_Parsed_1,Content_Parsed_2,Content_Parsed_3,Content_Parsed_4,Content_Parsed_5,Content_Parsed_6
0,The Tempe Police Department said it was invest...,We continue to assist investigators in any way...,autonomous car,1,799.0,We continue to assist investigators in any way...,we continue to assist investigators in any way...,we continue to assist investigators in any way...,we continue to assist investigators in any way...,we continue to assist investigators in any way...,continue assist investigators way ”uber ...


In [38]:
df['Complete_Filename'] = df['File_Name'] + '-' + df['Category']

In [39]:
list_columns = ["File_Name", "Category", "Complete_Filename", "Content", "Content_Parsed_6"]
df = df[list_columns]

df = df.rename(columns={'Content_Parsed_6': 'Content_Parsed'})

In [40]:
df.head()

Unnamed: 0,File_Name,Category,Complete_Filename,Content,Content_Parsed
0,The Tempe Police Department said it was invest...,autonomous car,The Tempe Police Department said it was invest...,We continue to assist investigators in any way...,continue assist investigators way ”uber ...
1,"On Sunday, the inevitable happened: An autonom...",autonomous car,"On Sunday, the inevitable happened: An autonom...",Cars don’t see wellAutonomous cars don’t track...,cars ’ see wellautonomous cars ’ track center...
2,Even before a driverless Uber vehicle struck a...,autonomous car,Even before a driverless Uber vehicle struck a...,"The accident in Tempe, Arizona, was believed t...",accident tempe arizona believe first tim...
3,"On Sunday night, a woman died after she was hi...",autonomous car,"On Sunday night, a woman died after she was hi...","On Sunday night, a woman died after she was hi...",sunday night woman die hit self-driving...
4,A self-driving vehicle made by Uber has struck...,autonomous car,A self-driving vehicle made by Uber has struck...,Something unexpectedly entering the vehicle’s ...,something unexpectedly enter vehicle’ path p...


### 2. Label coding

##### We'll create a dictionary with the label codification:

In [51]:
category_codes = {
    'business': 0,
    'entertainment': 1,
    'politics': 2,
    'sport': 3,
    'autonomous car': 4
}

In [52]:
# Category mapping
df['Category_Code'] = df['Category']
df = df.replace({'Category_Code':category_codes})

### 3. Train - test split

In [54]:
X_train, X_test, y_train, y_test = train_test_split(df['Content_Parsed'], 
                                                    df['Category_Code'], 
                                                    test_size=0.15, 
                                                    random_state=8)

##### Since we don't have much observations (only 2.225), we'll choose a test set size of 15% of the full dataset.

### 4. Text representation

In [55]:
# Parameter election
ngram_range = (1,2)
min_df = 10
max_df = 1.
max_features = 300

##### We have chosen these values as a first approximation. Since the models that we develop later have a very good predictive power, we'll stick to these values. But it has to be mentioned that different combinations could be tried in order to improve even more the accuracy of the models.

In [56]:
tfidf = TfidfVectorizer(encoding='utf-8',
                        ngram_range=ngram_range,
                        stop_words=None,
                        lowercase=False,
                        max_df=max_df,
                        min_df=min_df,
                        max_features=max_features,
                        norm='l2',
                        sublinear_tf=True)
                        
features_train = tfidf.fit_transform(X_train).toarray()
labels_train = y_train
print(features_train.shape)

features_test = tfidf.transform(X_test).toarray()
labels_test = y_test
print(features_test.shape)

(1782, 300)
(315, 300)


##### We can use the Chi squared test in order to see what unigrams and bigrams are most correlated with each category:

In [57]:
from sklearn.feature_selection import chi2
import numpy as np

for Product, category_id in sorted(category_codes.items()):
    features_chi2 = chi2(features_train, labels_train == category_id)
    indices = np.argsort(features_chi2[0])
    feature_names = np.array(tfidf.get_feature_names())[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print("# '{}' category:".format(Product))
    print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-5:])))
    print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-2:])))
    print("")

# 'autonomous car' category:
  . Most correlated unigrams:
. vehicles
. self
. cars
. driving
. autonomous
  . Most correlated bigrams:
. year old
. self driving

# 'business' category:
  . Most correlated unigrams:
. oil
. market
. bank
. growth
. firm
  . Most correlated bigrams:
. year old
. self driving

# 'entertainment' category:
  . Most correlated unigrams:
. tv
. music
. star
. award
. film
  . Most correlated bigrams:
. mr blair
. self driving

# 'politics' category:
  . Most correlated unigrams:
. tory
. blair
. election
. party
. labour
  . Most correlated bigrams:
. prime minister
. mr blair

# 'sport' category:
  . Most correlated unigrams:
. champion
. coach
. cup
. match
. game
  . Most correlated bigrams:
. self driving
. year old



In [58]:
bigrams

['last year',
 'tell bbc',
 'mr brown',
 'prime minister',
 'mr blair',
 'self driving',
 'year old']

##### We can see there are only six. This means the unigrams have more correlation with the category than the bigrams, and since we're restricting the number of features to the most representative 300, only a few bigrams are being considered.

##### Let's save the files we'll need in the next steps:

In [59]:
# X_train
with open('Pickles/X_train.pickle', 'wb') as output:
    pickle.dump(X_train, output)
    
# X_test    
with open('Pickles/X_test.pickle', 'wb') as output:
    pickle.dump(X_test, output)
    
# y_train
with open('Pickles/y_train.pickle', 'wb') as output:
    pickle.dump(y_train, output)
    
# y_test
with open('Pickles/y_test.pickle', 'wb') as output:
    pickle.dump(y_test, output)
    
# df
with open('Pickles/df.pickle', 'wb') as output:
    pickle.dump(df, output)
    
# features_train
with open('Pickles/features_train.pickle', 'wb') as output:
    pickle.dump(features_train, output)

# labels_train
with open('Pickles/labels_train.pickle', 'wb') as output:
    pickle.dump(labels_train, output)

# features_test
with open('Pickles/features_test.pickle', 'wb') as output:
    pickle.dump(features_test, output)

# labels_test
with open('Pickles/labels_test.pickle', 'wb') as output:
    pickle.dump(labels_test, output)
    
# TF-IDF object
with open('Pickles/tfidf.pickle', 'wb') as output:
    pickle.dump(tfidf, output)