# Text Preprocessing:

Data cleaning is an important and time-intensive step in data science: it underpins both data analysis and the building of machine learning models.

In [1788]:
import re
import time

import gensim
import langid
import nltk
import pandas as pd
import stopwords
import unidecode
from gensim.parsing.preprocessing import remove_stopwords, STOPWORDS
from nltk import word_tokenize

nltk.download('stopwords')
# Must stay after `import stopwords`: this rebinds the name `stopwords` to the NLTK corpus reader.
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to C:\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [1789]:
def get_data_from_csv(url):
    """Read a CSV into a pandas DataFrame.

    Args:
        url: Location of the CSV; passed unchanged to ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    return pd.read_csv(url)

# Now we start preparing the data:

- Lowercase the text
- Remove links
- Remove non-English text
- Remove punctuation and stop words

https://github.com/kk7nc/Text_Classification

# Stop words
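Text and document classification over social media such as Twitter and Facebook is usually affected by the noisy nature (abbreviations, irregular forms) of the text corpora. Stop words (very frequent words such as "the", "a" and "is" that carry little topical information) are part of this noise and are commonly removed before classification.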


# Capitalization
Sentences can contain a mixture of uppercase and lowercase letters, and multiple sentences make up a text document. To reduce the problem space, the most common approach is to convert everything to lowercase. This brings all words in a document into the same space, but it can change the meaning of some words, such as "US" becoming "us", where the first represents the United States of America and the second is a pronoun. To solve this, slang and abbreviation converters can be applied.

# Noise Removal
Another aspect of text cleaning as a pre-processing step is noise removal. Text documents generally contain punctuation and other special characters that are not necessary for text mining or classification purposes. Although punctuation is critical for understanding the meaning of a sentence, it can affect classification algorithms negatively.

In [1790]:
def cleaning_preprocessing_data_from_csv(data):
    """Normalize raw article text into lower-case strings without stop words.

    For every text the following steps run in order: escape sequences and
    backslashes become spaces, links and ``.com`` names are removed,
    whitespace is collapsed, accents are transliterated to ASCII, the text is
    lower-cased, repeated letters are limited to two, every run of
    non-letters becomes a single space, and finally NLTK and gensim stop
    words (plus a few boilerplate words) are removed.

    Args:
        data: Iterable of strings (e.g. a DataFrame column). Every item must
            support ``str.replace``; missing values are not handled here.

    Returns:
        A list with one cleaned string per input item, in input order.

    Note:
        The text is tokenized directly. Earlier revisions wrapped it in
        ``repr()`` and then discarded the first token and the last character
        to get rid of the added quotes, which could drop the first real word
        and leave trailing whitespace.
    """
    # Loop-invariant patterns, compiled once instead of once per text.
    whitespace_run = re.compile(r'\s+')
    repeated_letter = re.compile(r"([A-Za-z])\1{1,}", re.DOTALL)
    # One punctuation character repeated two or more times. The previous
    # pattern closed its character class early and practically never matched.
    repeated_punct = re.compile(r"""([.,/#!"$<>@\[\]^&%*?;:{}=_`~()+\-'])\1+""")
    space_run = re.compile(' {2,}')
    non_letters = re.compile(r"[^a-zA-Z]+")

    # A set gives O(1) membership tests; build it once for all texts.
    stoplist = set(stopwords.words('english'))
    # Extra boilerplate words, presumably from Medium.com article headers
    # such as "1.3K Followers Pinned" or "16 hours ago".
    stoplist.update(['ago', 'followers', 'pinned', 'read', 'min', 'published', 'days', 'hours', 'the'])

    cleaned_data = []

    for text in data:
        # Replace literal "\n" sequences, newlines, tabs and backslashes with
        # a space, and re-join domains that were split as ". com".
        formatted_text = (
            text.replace('\\n', ' ')
            .replace('\n', ' ')
            .replace('\t', ' ')
            .replace('\\', ' ')
            .replace('. com', '.com')
        )

        # Remove links starting with http/https.
        formatted_text = re.sub(r'http\S+', '', formatted_text)

        # Remove space-prefixed names ending in ".com".
        formatted_text = re.sub(r"\ [A-Za-z]*\.com", " ", formatted_text)

        # Collapse every whitespace run into a single space.
        formatted_text = whitespace_run.sub(' ', formatted_text)
        formatted_text = formatted_text.replace('?', ' ? ').replace(')', ') ')

        # Transliterate accented characters to their closest ASCII form.
        ascii_text = unidecode.unidecode(formatted_text)

        lower_text = ascii_text.lower()

        # Limit repeated letters to two occurrences ("soooo" -> "soo").
        formatted_text = repeated_letter.sub(r"\1\1", lower_text)

        # Limit repeated punctuation to a single occurrence.
        formatted_text = repeated_punct.sub(r'\1', formatted_text)

        # Collapse runs of two or more spaces into one.
        formatted_text = space_run.sub(' ', formatted_text)

        # Replace every run of non-letter characters with a single space.
        letters_only = non_letters.sub(' ', formatted_text)

        # Drop NLTK stop words and the boilerplate words added above.
        tokens = [word for word in word_tokenize(letters_only) if word.lower() not in stoplist]

        # Second pass with gensim's (larger) stop-word list.
        final_text = remove_stopwords(' '.join(tokens))

        cleaned_data.append(final_text)

    return cleaned_data

In [1791]:
def remove_non_english_articles_and_remove_duplicated_rows(df):
    """Drop duplicated and non-English rows from ``df`` in place.

    Rows with a repeated ``content`` value are removed first (the first
    occurrence is kept). Then every row whose ``topic`` is not detected as
    English is removed, and afterwards every remaining row whose ``content``
    is not detected as English. Each detected foreign language is printed.

    Args:
        df: DataFrame with string columns ``topic`` and ``content``.

    Returns:
        The same ``df`` object, modified in place.

    Note:
        ``detect`` and ``DetectorFactory`` are not imported in the visible
        import cell; presumably they come from ``langdetect`` and are expected
        to be in scope already (TODO confirm). Errors raised by ``detect``
        are not handled here.
    """
    # Keep only the first row for each distinct 'content' value.
    df.drop_duplicates('content', inplace=True)

    # Set once instead of on every iteration; presumably fixes the detector's
    # seed so results are reproducible (verify against the library docs).
    DetectorFactory.seed = 0

    for column in ("topic", "content"):
        # Collect the labels first and drop afterwards, so the frame is not
        # modified while one of its columns is being iterated.
        foreign_labels = []
        for label, text in df[column].items():
            language = detect(text)  # detect once per row and reuse the result
            if language != "en":
                print("Found different language: " + language)
                foreign_labels.append(label)
        df.drop(foreign_labels, axis=0, inplace=True)  # axis 0 for rows

    return df

In [1792]:
def drop_rows_with_short_content(df):
    """Filter out rows whose topic or content has too few words.

    Words are counted by splitting on whitespace. A row is kept only when its
    ``topic`` has more than one word and its ``content`` has more than three
    words. The input frame is not modified.

    Args:
        df: DataFrame with string columns ``topic`` and ``content``.

    Returns:
        A new DataFrame containing only the rows that pass both checks.
    """
    topic_word_counts = df["topic"].str.split().str.len()
    # Keep topics made of at least two words.
    with_long_topic = df[topic_word_counts > 1]

    content_word_counts = with_long_topic["content"].str.split().str.len()
    # Keep contents made of at least four words.
    return with_long_topic[content_word_counts > 3]

## Keep the 'cleaned' data frame

In [1793]:
def insert_df_to_csv(topic, category, content, csv_file_name, *, index=True):
    """Build a topic/category/content DataFrame and write it to a CSV file.

    Args:
        topic: Values for the ``topic`` column.
        category: Values for the ``category`` column.
        content: Values for the ``content`` column.
        csv_file_name: Destination passed to ``DataFrame.to_csv``.
        index: Whether to write the row index as the first, unnamed column.
            Defaults to ``True`` to keep the existing file format. Pass
            ``False`` to avoid the extra ``Unnamed: 0`` column that appears
            when the file is read back with ``pd.read_csv``.

    Returns:
        The DataFrame that was written.
    """
    new_df = pd.DataFrame({
        "topic": topic,
        "category": category,
        "content": content,
    })
    new_df.to_csv(csv_file_name, index=index)
    return new_df

In [1794]:
# Load the merged scraping results and report the raw shape before any cleaning.
df = get_data_from_csv("merge.csv")
print("Before cleaning data the shape of data frame is : " + str(df.shape))
# Bare expression: displayed as the cell's output when run in a notebook.
df

Before cleaning data the shape of data frame is : (932, 1218)


Unnamed: 0.1,Unnamed: 0,topic,category,content,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 1208,Unnamed: 1209,Unnamed: 1210,Unnamed: 1211,Unnamed: 1212,Unnamed: 1213,Unnamed: 1214,Unnamed: 1215,Unnamed: 1216,Unnamed: 1217
0,7,The Importance the of Auditing Your Attention:...,business,1.3K Followers Pinned One of the most challen...,,,,,,,...,,,,,,,,,,
1,1,Even the Best Startup Leaders Have This Weakne...,business,19.1K Followers Pinned All the actionable stu...,,,,,,,...,,,,,,,,,,
2,2,Econ Made Me Jack Welch All My Friends,business,32 Followers 16 hours ago Or: How I learned t...,,,,,,,...,,,,,,,,,,
3,3,10 Skills That’ll Make You Money (In 2022),business,10.6K Followers Pinned How To Learn Faster By...,,,,,,,...,,,,,,,,,,
4,4,Your Biggest Risk Is You,business,647 Followers Published in Towards Data Scien...,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
927,874,Incorporating Heart Healthy Habits in Life,Health-and-Fitness,\n\t\t\t\t\tThe Heart Foundation suggests that...,,,,,,,...,,,,,,,,,,
928,875,Importance of Health in Our Lives,Health-and-Fitness,"\n\t\t\t\t\tIt is rightly said, ""Health is Wea...",,,,,,,...,,,,,,,,,,
929,876,The Importance Of Medical Devices And Automate...,Health-and-Fitness,"\n\t\t\t\t\tAccording to a survey report, all ...",,,,,,,...,,,,,,,,,,
930,877,What Are the Characteristics That Make a Hospi...,Health-and-Fitness,\n\t\t\t\t\tHospitals cater to the most vulner...,,,,,,,...,,,,,,,,,,


In [None]:
# Cleaning pipeline: drop short rows first, then duplicated and non-English rows.
df = drop_rows_with_short_content(df)
df = remove_non_english_articles_and_remove_duplicated_rows(df)
# Normalize the two text columns of the remaining rows.
title_list = cleaning_preprocessing_data_from_csv(df["topic"])
content_list = cleaning_preprocessing_data_from_csv(df["content"])

# 'category' is passed through unchanged; the combined frame is also written to disk.
new_df = insert_df_to_csv(title_list, df["category"], content_list, "cleaned-scraping-data.csv")
print("After cleaning data the shape of data frame is : " + str(new_df.shape))
# Bare expression: displayed as the cell's output when run in a notebook.
new_df

Found different language: de
Found different language: no
Found different language: nl
Found different language: de
Found different language: fr
Found different language: de
Found different language: da
Found different language: de
Found different language: de
Found different language: de
Found different language: no
Found different language: nl
Found different language: it
Found different language: af
Found different language: es
Found different language: ro
Found different language: af
Found different language: pt
Found different language: fr
Found different language: es
Found different language: ca
Found different language: af
Found different language: de
Found different language: it
Found different language: de
Found different language: de
Found different language: de
Found different language: de
Found different language: es
Found different language: hu
Found different language: de
Found different language: it
Found different language: nl
Found different language: de
Found differen