# Webpage classification - Fetching data from HTML files

In [62]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
from bs4 import BeautifulSoup
from string import punctuation
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
% matplotlib inline
import nltk
nltk.download('stopwords') 
nltk.download('wordnet')
import string

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/karthik/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/karthik/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [6]:
STOPWORDS = stopwords.words('english')
# NLTK's English stop-word list; fetch_data drops these words from the page text
print(STOPWORDS)
# display the list for inspection

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

## Fetch data

In [80]:
class fetch_data(object):
    """
    This class helps to fetch the data from
    the HTML files to a structured one.

    Every file found below the given directory becomes one row of
    `data_frame` with the columns "URL", "Text", "University" and "Label".
    The name of the folder holding the file is stored as "University" and
    the name of that folder's parent as "Label".
    """

    # column layout of the resulting dataframe
    _COLUMNS = ["URL", "Text", "University", "Label"]

    def __init__(self, directory):
        """
        The constructor of the class.

        Arguments:

        1. directory: The directory of the files.
        """

        self.data_frame = pd.DataFrame(columns=self._COLUMNS)
        # dataframe to load data into (rebuilt by get_data)
        self.dir = directory
        # input files directory
        self.lemmatizer = WordNetLemmatizer()
        # lemmatizer object
        self.stop_words = frozenset(STOPWORDS)
        # snapshot of the stop words as a set for O(1) membership tests

    def __text_process(self, text):
        """
        This function helps to perform text processing
        by removing stop words, and lemmatizing the words.

        Arguments:

        1. self: The object.
        2. text: The input text to be processed (expected to be
        lower-cased already, as done by the caller).

        Return:
        1. text: Processed text, words joined by single spaces.
        """

        text = re.sub(r"\bcs\b", "computer science", text)
        # expand the stand-alone word "cs" only; a plain str.replace would
        # also rewrite the "cs" inside words such as "physics" or "topics"
        words = [
            self.lemmatizer.lemmatize(word)
            for word in text.split()
            if word not in self.stop_words and len(word) > 1
        ]
        # split on any whitespace (a lone tab must separate words too),
        # drop stop words and single characters, lemmatize the rest
        return " ".join(words)

    def __get_text(self, filename):
        """
        This function helps to get the cleaned text
        data for a given HTML file name.

        Arguments:

        1. self: The object.
        2. filename: The filename from which the data
        is to be extracted.

        Return:
        1. text: The processed text of the file.
        2. error: 1 if the file was not valid UTF-8 and had to be
        decoded as Latin-1 instead, otherwise 0.
        """

        try:
            with open(filename, encoding="utf-8") as obj:
                data = obj.read()
            error = 0
        except UnicodeDecodeError:
            # Latin-1 maps every byte to a character, so this read cannot
            # fail to decode; the page text is kept instead of being
            # replaced by a placeholder, and the problem is still reported
            with open(filename, encoding="latin-1") as obj:
                data = obj.read()
            error = 1
        data = re.sub(r'^[^<]+', "", data)
        # remove top headers from the file (everything before the first "<")
        data_bs = BeautifulSoup(data, "html.parser")
        # name the parser explicitly so the result does not depend on
        # which optional parsers happen to be installed
        text = data_bs.get_text()
        # get the text data alone
        text = text.translate(str.maketrans("", "", string.punctuation))
        # remove all punctuations from the data
        text = re.sub(r"\d", "", text)
        # remove all digits from the text
        text = re.sub(r"\s+", " ", text).lower()
        # turn every run of whitespace (newlines, tabs, spaces) into one space
        text = self.__text_process(text)
        # func call to perform further text processing
        return text, error

    def __get_filename(self):
        """
        This function acts as a generator
        to yield, for every file below the input directory,
        the tuple (url, university, label, file_path).

        Directories and files are visited in sorted order so that
        repeated runs produce the rows in the same order.
        """

        for path, dir_lst, file_lst in os.walk(self.dir):
            # iterate over sub-directories to the files
            dir_lst.sort()
            # in-place sort makes os.walk descend in a fixed order
            folder = os.path.normpath(path)
            university = os.path.basename(folder)
            # folder holding the files
            label = os.path.basename(os.path.dirname(folder))
            # parent of that folder; empty string when there is none
            for f in sorted(file_lst):
                # iterate over each file in the sub-directory
                # NOTE(review): strip only removes leading/trailing "^";
                # carets inside the name are kept - confirm this is intended
                yield f.strip("^"), university, label, os.path.join(path, f)

    def get_data(self):
        """
        This function traverses the sub-directories
        to each filename, fetches the required data
        after pre-processing, and stores it in `data_frame`.

        Prints the number of files that could not be decoded as UTF-8.
        """

        records = []
        # one dict per file; the dataframe is built once at the end because
        # growing it row by row copies the whole frame on every insertion
        count = 0
        for url, university, label, file_path in self.__get_filename():
            text, error = self.__get_text(file_path)
            # get the processed text of the file
            if error:
                count += 1
            records.append({
                "URL": url,
                # text is kept as str; encoding it to bytes would end up
                # as a "b'...'" literal in the exported CSV
                "Text": text,
                "University": university,
                "Label": label,
            })
        self.data_frame = pd.DataFrame(records, columns=self._COLUMNS)
        # replace the frame with the freshly collected rows
        print(count)

### Fetching data 

In [81]:
crawl = fetch_data("webkb")
# create the extractor object, pointed at the "webkb" folder

# NOTE: the directory argument is subject to change

In [82]:
crawl.get_data()
# fetch the required data into crawl.data_frame;
# the printed number is how many files hit a read error

92


In [83]:
crawl.data_frame.to_csv("data.csv", index=False)
# save the collected rows to "data.csv", leaving out the index column