# Cleaning data from csv

Data cleaning is an important and intensive process in Data science which aids in data analysis and building machine learning models.

In [858]:
import pandas as pd
import unidecode 
import re
import time 
import stopwords 
nltk.download('stopwords') 
from nltk.corpus import stopwords 
from nltk import word_tokenize
import gensim
from gensim.parsing.preprocessing import remove_stopwords, STOPWORDS
import langid

[nltk_data] Downloading package stopwords to C:\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [859]:
def get_data_from_csv(url):
    return pd.read_csv(url)

In [860]:
def cleaning_preprocessing_data_from_csv(data):  
    cleaned_data = []
    
    for text in data:

        # Replacing all the occurrences of \n,\\n,\t,\\ with a space.
        formatted_text = text.replace('\\n', ' ').replace('\n', ' ').replace('\t',' ').replace('\\', ' ').replace('. com', '.com')

        # Removing all the occurrences of links that starts with https
        formatted_text = re.sub(r'http\S+', '', formatted_text)

        # Remove all the occurrences of text that ends with .com
        formatted_text = re.sub(r"\ [A-Za-z]*\.com", " ", formatted_text)

        # Remove all whitespaces
        pattern = re.compile(r'\s+') 
        formatted_text = re.sub(pattern, ' ', formatted_text)
        formatted_text = formatted_text.replace('?', ' ? ').replace(')', ') ')

        # Remove accented characters from text using unidecode.
        # Unidecode() - It takes unicode data & tries to represent it to ASCII characters. 
        remove_character = unidecode.unidecode(formatted_text)

        # Convert text to lower case
        lower_text = remove_character.lower()

        # Pattern matching for all case alphabets
        Pattern_alpha = re.compile(r"([A-Za-z])\1{1,}", re.DOTALL)

        # Limiting all the  repeatation to two characters.
        formatted_text = Pattern_alpha.sub(r"\1\1", lower_text) 

        # Pattern matching for all the punctuations that can occur
        Pattern_Punct = re.compile(r'(\'[.,/#!"$<>@[]^&%^&*?;:{}=_`~()+-])\1{1,}')

        # Limiting punctuations in previously formatted string to only one.
        Combined_Formatted = Pattern_Punct.sub(r'\1', formatted_text)

        # The below statement is replacing repeatation of spaces that occur more than two times with that of one occurrence.
        Final_Formatted = re.sub(' {2,}',' ', Combined_Formatted)

        # The formatted text after removing not necessary punctuations.
        Formatted_Text = re.sub(r"[^a-zA-Z]+", ' ', Final_Formatted) 

        # Text without stopwords
        remove_stop_words = repr(Formatted_Text)
        stoplist = stopwords.words('english') 
        stoplist = set(stoplist)
        No_StopWords = [word for word in word_tokenize(remove_stop_words) if word.lower() not in stoplist ]

        # Convert list of tokens_without_stopwords to String type.
        words_string = ' '.join(No_StopWords) 

        # Remove more stop words  
        final_text = remove_stopwords(words_string) 
        
        # Split the "'" from the edges
        cleaned_data.append(final_text[1:len(final_text)-1])
        
    return cleaned_data

In [861]:
def remove_non_english_articles_and_remove_duplicated_rows(df):
    # Remove duplicated rows
    df.drop_duplicates(inplace=True)
    
    for topic in df["topic"]:   
        DetectorFactory.seed = 0
        if detect(topic) != "en":
            print("Found different language: " + detect(topic))
            df.drop(df.index[(df["topic"] == topic)], axis=0, inplace=True)
            
    for content in df["content"]:
        DetectorFactory.seed = 0
        if detect(content) != "en":
            print("Found different language: " + detect(content))
            df.drop(df.index[(df["content"] == content)], axis=0, inplace=True)        

    return df

In [862]:
def insert_df_to_csv(topic, category, content, csv_file_name):
    data = pd.DataFrame({
    "topic": topic,
    "category": category,
    "content": content
    })
    data.to_csv(csv_file_name)
    print(df.shape)

In [863]:
print("Start Cleaning Data!")
df = get_data_from_csv("example.csv")
df = remove_non_english_articles_and_remove_duplicated_rows(df)
title_list = cleaning_preprocessing_data_from_csv(df["topic"])
content_list = cleaning_preprocessing_data_from_csv(df["content"])
insert_df_to_csv(title_list, df["category"], content_list, "cleaned-scraping-data.csv")
print("Finish Cleaning Data!")

Start Cleaning Data!
Found different language: de
Found different language: no
Found different language: nl
(21, 4)
Finish Cleaning Data!
