# Preprocessing

In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

## Loading the Dataset

In [21]:
df = pd.read_csv('scrapped_data_version_1_comp_business_mind.csv')

In [3]:
df.head()

Unnamed: 0,Book_title,book_id,book_price,author_name(s),publisher_name,publication_year,sub_title_book,book_edition,description,book_availablity,date_of_sale,year_month,author_count,common_width,common_heighy,category_verification
0,Software Engineering at Google,209970024,71.99,Titus Winters,O'Reilly Media,2020,Lessons Learned from Programming Over Time,,"Today, software engineers need to know not onl...",True,2020-02-28T00:00:00,Feb 2020,3,97,150,arch
1,The Software Architect Elevator,209997033,71.99,Gregor Hohpe,O'Reilly Media,2020,Redefining the Architect's Role in the Digital...,,As the digital economy changes the rules of th...,True,2020-04-08T00:00:00,Apr 2020,1,97,150,arch
2,Software Architecture Metrics,210567692,71.99,Christian Ciceri,O'Reilly Media,2022,,,Software architecture metrics are key to the m...,True,2022-05-18T00:00:00,May 2022,10,97,150,arch
3,Designing Hexagonal Architecture with Java,211106723,46.99,Davi Vieira,Packt Publishing,2023,Build maintainable and long-lasting applicatio...,,,True,2023-09-29T00:00:00,Sep 2023,1,97,150,arch
4,OpenStack for Architects,96237030,44.99,Michael Solberg,Packt Publishing,2018,Design production-ready private cloud infrastr...,,,True,2018-05-31T00:00:00,May 2018,2,97,150,arch


In [4]:
df = df[['Book_title','sub_title_book','description']]

In [5]:
df.tail()

Unnamed: 0,Book_title,sub_title_book,description
43050,Mindful Metamorphosis,A Compassionate Guide to Inner Transformation,Unlocking your potential has never been easier...
43051,Beautiful Women: A View from the Heart,Stories of Inspiration to Help Mend a Torn World,Beautiful Women: A View from the Heart contain...
43052,Feminine Reclaimed,A Memoir,There comes a point in everyone's life where w...
43053,Celtic Mythology,Tales From the Celtic Pantheon,"CELTIC MYTHOLOGY ""Celtic Mythology: Tales from..."
43054,Discover & Use Your Greatest Superpower,,This book introduces you to an extended identi...


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43055 entries, 0 to 43054
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Book_title      43055 non-null  object
 1   sub_title_book  30499 non-null  object
 2   description     38031 non-null  object
dtypes: object(3)
memory usage: 1009.2+ KB


In [7]:
df.describe()

Unnamed: 0,Book_title,sub_title_book,description
count,43055,30499,38031
unique,35443,27406,35948
top,Neural Information Processing,Click here to enter text.,The full text downloaded to your computer With...
freq,71,131,26


In [8]:
df.isnull().mean()*100

Book_title         0.000000
sub_title_book    29.162699
description       11.668796
dtype: float64

In [9]:
df[df['sub_title_book'].isnull()]

Unnamed: 0,Book_title,sub_title_book,description
2,Software Architecture Metrics,,Software architecture metrics are key to the m...
5,Quantum Computing Fundamentals,,ONE-VOLUME INTRODUCTION TO QUANTUM COMPUTING C...
6,Head First Software Architecture,,What will you learn from this book? If you're ...
9,PCI Express System Architecture,,PCI Express System Architecture provides an in...
13,"Pentium Processor System Architecture, Second ...",,A comprehensive description of the Pentium mic...
...,...,...,...
43025,9. 1. 1. Complete Guide to Natural Healing,,"VIE is born of the French Aristocracy, she is ..."
43029,Ancient Art of Dowsing,,This book answers all the questions on dowsing...
43032,Mirrors & Windows,,"What do an astronaut in training, the CIO of T..."
43033,SQ21,,"According to author Cindy Wigglesworth, Spirit..."


In [10]:
df[df['description'].isnull()]

Unnamed: 0,Book_title,sub_title_book,description
3,Designing Hexagonal Architecture with Java,Build maintainable and long-lasting applicatio...,
4,OpenStack for Architects,Design production-ready private cloud infrastr...,
8,Docker Deep Dive,Zero to Docker in a single book,
10,Docker Deep Dive.,Zero to Docker in a Single Book,
14,Linux Administration Cookbook,Insightful recipes to work with system adminis...,
...,...,...,...
40925,The Guru Question,The Perils and Rewards of Choosing a Spiritual...,
40927,Your Deepest Intent,Letters from the Infinite,
40929,Kuan Yin,Accessing the Power of the Divine Feminine,
40937,Awakening into Oneness,The Power of Blessing in the Evolution of Cons...,


if we dont want to drop these records, we can impute them with a random value to retain as much information as possible about the book.

- Later approaches can also be using pre-trained models and paraphrase the data in hand to generate custom sub titles, or descriptions in the same way

In [11]:
df.fillna('UNK', inplace=True) # UNK for unknown

In [12]:
# merging the features to have a final column for corpus
df['merged_TSD'] = df['Book_title'] + ' ' + df['sub_title_book']+ ' ' + df['description']
df.head()

Unnamed: 0,Book_title,sub_title_book,description,merged_TSD
0,Software Engineering at Google,Lessons Learned from Programming Over Time,"Today, software engineers need to know not onl...",Software Engineering at Google Lessons Learned...
1,The Software Architect Elevator,Redefining the Architect's Role in the Digital...,As the digital economy changes the rules of th...,The Software Architect Elevator Redefining the...
2,Software Architecture Metrics,UNK,Software architecture metrics are key to the m...,Software Architecture Metrics UNK Software arc...
3,Designing Hexagonal Architecture with Java,Build maintainable and long-lasting applicatio...,UNK,Designing Hexagonal Architecture with Java Bui...
4,OpenStack for Architects,Design production-ready private cloud infrastr...,UNK,OpenStack for Architects Design production-rea...


## Pre-processing

In [13]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

In [14]:
# defining functions for each task

def lower_case(text): 
    return text.lower()

def remove_punctuations(text):
    no_punc_text = text.translate(str.maketrans('','',string.punctuation))
    return no_punc_text

def tokenize(text):
    return word_tokenize(text)

def remove_stopwords(text):
    stopset = set(stopwords.words('english'))
    return [word for word in text if word not in stopset]

def lemmatize_text(tokens):
    lemmer = WordNetLemmatizer()
    return [lemmer.lemmatize(word) if lemmer.lemmatize(word) is not None else word for word in tokens]

def remove_special_characters(tokens):
    return [re.sub('[^A-Za-z]+', '', word) for word in tokens]

In [15]:
def preprocess_data(text):
    '''
    To Preprocess the data in one execution.
    
    follows all the disciplines of textual preprocessing in NLP
    '''
    
    text = lower_case(text) # lower the text
    clean_text = remove_punctuations(text) # clean the text 
    tokens = tokenize(text) # tokenization
    clean_tokens = remove_stopwords(tokens) # stopwords removal
    lemmas = lemmatize_text(clean_tokens) # lemmatization 
    final = remove_special_characters(lemmas) # removing special characters
    return ' '.join(final)

In [16]:
preprocess_data(df['merged_TSD'][0]) # testing

'software engineering google lesson learned programming time today  software engineer need know program effectively also develop proper engineering practice make codebase sustainable healthy  book emphasizes difference programming software engineering  software engineer manage living codebase evolves responds '

In [17]:
df['processed_TSD'] = df['merged_TSD'].apply(preprocess_data)

In [18]:
df['processed_TSD'].head()

0    software engineering google lesson learned pro...
1    software architect elevator redefining archite...
2    software architecture metric unk software arch...
3    designing hexagonal architecture java build ma...
4    openstack architect design productionready pri...
Name: processed_TSD, dtype: object

In [19]:
# exporting the processed dataset
df.to_csv('pre_processed_data.csv',index_label=False)