# Data cleaning and Preprocessing



In order to clean and preprocess the data, I created a data cleaning and preprocessing function with the following capabilities: 

- Lemmatization
- Stopword removal
- Lowercase
- Punctuation cleaning
- Emoji cleaning
- Number cleaning
- Weblinks cleaning
- Unnecessary spaces removal

I gave the user the freedom to choose which cleaning steps to apply by creating a unified function in which every cleaning step is controlled by a boolean flag. For the purpose of this project, I do not lemmatize, remove stopwords, lowercase, remove punctuation, or remove numbers, so that the text to be summarized keeps its semantic context.

In [2]:
import pandas as pd 
import numpy as np 
import string 
import re 
import nltk 
from nltk.tokenize import word_tokenize 
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords 

# Fetch the NLTK data packages this notebook relies on: the stopword lists,
# the Punkt tokenizer models (presumably for word_tokenize / sent_tokenize)
# and WordNet (presumably for WordNetLemmatizer).
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/yassine/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /home/yassine/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /home/yassine/nltk_data...


True

In [7]:
# Individual cleaning functions
def remove_web_links(text):
  """Delete web links and a few link-like artefacts from ``text``.

  The patterns below are applied one after another, in the order listed,
  each on the output of the previous one. Besides URLs, the list also
  removes ``Figure N:`` labels (single digit) and ``@handle`` mentions.

  NOTE: the dots in the URL patterns are not escaped, so each ``.`` matches
  any character, not only a literal dot.

  Args:
    text: The string to clean.

  Returns:
    ``text`` with every match of every pattern removed.
  """
  patterns = (
      r'http://www.\w+.org/',
      r'http://www.\w+.org/',
      r'http://www.([\w\S]+).org/\w+\W\w+',
      r'https://www.\w+.org/',
      r'https://www.([\w\S]+).org/\w+\W\w+',
      r'https://\w+.\w+/\d+.\d+/\w\d+\W\w+',
      r'https://\w+.\w+/\d+.\d+/\w\d+\W\w+',
      r'Figure\s\d:',
      r'\Wwww.\w+\W\w+\W',
      "@[A-Za-z0-9]+",
      r'www.\w+',
  )
  cleaned = text
  for pattern in patterns:
    cleaned = re.sub(pattern, '', cleaned)

  return cleaned

def remove_emojis(text):
  """Delete every character that falls in one of the listed code-point ranges.

  NOTE: the ranges are much wider than emoji alone. ``U+24C2-U+1F251`` and
  ``U+10000-U+10FFFF`` together cover, among others, CJK ideographs and all
  supplementary-plane characters, so such text is removed as well.

  Args:
    text: The string to clean.

  Returns:
    ``text`` with all runs of matching characters removed.
  """
  char_ranges = (
      "\U0001F600-\U0001F64F",  # emoticons
      "\U0001F300-\U0001F5FF",  # symbols & pictographs
      "\U0001F680-\U0001F6FF",  # transport & map symbols
      "\U0001F1E0-\U0001F1FF",  # regional indicators (flags)
      "\U00002500-\U00002BEF",  # box drawing .. misc symbols and arrows
      "\U00002702-\U000027B0",  # subset of the dingbats block
      "\U00002702-\U000027B0",  # (repeated in the original class)
      "\U000024C2-\U0001F251",  # very wide range, see NOTE above
      "\U0001f926-\U0001f937",
      "\U00010000-\U0010ffff",  # all supplementary planes
      "\u2640-\u2642",
      "\u2600-\u2B55",
      "\u200d",                 # zero-width joiner
      "\u23cf",
      "\u23e9",
      "\u231a",
      "\ufe0f",                 # variation selector-16
      "\u3030",                 # wavy dash
  )
  char_class = "[" + "".join(char_ranges) + "]+"

  return re.sub(char_class, '', text, flags=re.UNICODE)

def remove_spaces(text):
  """Remove unnecessary whitespace from ``text``.

  Every run of whitespace (spaces, tabs, newlines) is collapsed into a single
  space, and leading/trailing whitespace is trimmed. Newlines are replaced by
  a space rather than deleted, so the last word of one line is not glued to
  the first word of the next.

  Args:
    text: The string to clean.

  Returns:
    ``text`` with whitespace normalised to single spaces.
  """
  return re.sub(r'\s+', ' ', text).strip()

def remove_stopwords(text):
  """Drop English stopwords from ``text``.

  The text is split with ``word_tokenize`` and the surviving tokens are
  re-joined with single spaces, so the original spacing around punctuation
  is not preserved. Matching is case-sensitive: a token is dropped only if
  it appears verbatim in NLTK's English stopword list.

  Args:
    text: The string to filter.

  Returns:
    The remaining tokens joined by a single space.
  """
  # The corpus reader is imported as `stopwords`; the previous `stop.words`
  # referenced an undefined name and raised NameError on every call.
  stop_words = set(stopwords.words('english'))
  kept_tokens = [token for token in word_tokenize(text) if token not in stop_words]

  return " ".join(kept_tokens)

def lemmatize_text(text):
  """Lemmatize every token of ``text`` with ``WordNetLemmatizer``.

  The text is split into sentences, each sentence into word tokens, and each
  token is passed to ``lemmatize`` with its default arguments.

  Args:
    text: The string to lemmatize.

  Returns:
    All lemmatized tokens, in order, joined by a single space.
  """
  lemmatizer = WordNetLemmatizer()
  lemmas = [
      lemmatizer.lemmatize(token)
      for sentence in sent_tokenize(text)
      for token in word_tokenize(sentence)
  ]
  return ' '.join(lemmas)

def lowercase_text(text):
  """Return a lowercased copy of ``text``."""
  lowered = text.lower()
  return lowered

def remove_punctuations(text):
  """Delete punctuation characters from ``text``.

  Removes every character of ``string.punctuation`` plus two characters
  that set does not contain: the right single quotation mark and the
  horizontal ellipsis.

  Args:
    text: The string to clean.

  Returns:
    ``text`` without any of those characters.
  """
  extra_marks = ['’', '…']  # punctuations not in string.punctuation
  unwanted = string.punctuation + ''.join(extra_marks)

  # One translate pass deletes all unwanted characters at once.
  return text.translate(str.maketrans('', '', unwanted))

def remove_numbers(text):
  """Remove a leading list marker and all ASCII digits from ``text``.

  A numbered-list prefix at the very start of the string (digits, a dot and
  trailing whitespace, e.g. ``"1. "``) is dropped as a whole; afterwards every
  remaining character ``0``-``9`` is deleted.

  Args:
    text: The string to clean, or ``None``.

  Returns:
    The cleaned string, or ``None`` when ``text`` is ``None``.
  """
  if text is None:
    # Previously the guard was followed by an unconditional re.sub, which
    # raised TypeError for None.
    return text

  # The marker pattern is a regex, so it needs re.sub; str.replace treated it
  # as literal text and never matched anything.
  text = re.sub(r'^\d+\.\s+', '', text)

  return re.sub("[0-9]", '', text)


In [8]:
# Unified boolean controlled cleaning function 
def clean_and_preprocess_data(text, lowercase=True, clean_stopwords=True, clean_punctuations=True, clean_links=True, 
                              clean_emojis=True, clean_spaces=True, clean_numbers=True,  lemmatize=True):
  """Run the selected cleaning steps on ``text`` and return the result.

  Every step is enabled by default and can be switched off with its flag.
  Enabled steps run in this order: links, stopwords, punctuation, emojis,
  spaces, numbers, lemmatization, lowercasing.

  Args:
    text: The string to clean.
    lowercase: Lowercase the final result.
    clean_stopwords: Apply ``remove_stopwords``.
    clean_punctuations: Apply ``remove_punctuations``.
    clean_links: Apply ``remove_web_links``.
    clean_emojis: Apply ``remove_emojis``.
    clean_spaces: Apply ``remove_spaces``.
    clean_numbers: Apply ``remove_numbers``.
    lemmatize: Apply ``lemmatize_text``.

  Returns:
    The cleaned string.
  """
  # Links must go first: the link patterns rely on ':', '/', '.' and '@',
  # which stopword tokenisation splits apart and punctuation removal deletes.
  # In the previous order link cleaning could not match once those ran.
  if clean_links:
    text = remove_web_links(text)

  if clean_stopwords:
    text = remove_stopwords(text)

  if clean_punctuations:
    text = remove_punctuations(text)

  if clean_emojis:
    text = remove_emojis(text)

  if clean_spaces:
    text = remove_spaces(text)

  if clean_numbers:
    text = remove_numbers(text)

  if lemmatize:
    text = lemmatize_text(text)

  if lowercase:
    text = lowercase_text(text)

  return text

# Preprocessing and cleaning the raw data

In [11]:
text_df = pd.read_csv("top1000.csv")

# Replace missing values in the text columns with empty strings and make
# sure every cell is a str before the cleaning functions are applied.
for column in ('abstract', 'full_text', 'conclusion'):
  text_df[column] = text_df[column].fillna("").astype(str)


In [12]:

# Steps switched off for this project; link, emoji and space cleaning keep
# their default (enabled) setting.
disabled_steps = dict(lemmatize=False, clean_numbers=False, clean_stopwords=False,
                      clean_punctuations=False, lowercase=False)

for column in ('abstract', 'full_text', 'conclusion'):
  text_df[column] = text_df[column].apply(
      lambda value: clean_and_preprocess_data(value, **disabled_steps))

print(text_df.head())

# Saving the processed data into a .csv file 

# NOTE(review): to_csv writes the DataFrame index as an extra column by
# default; confirm whether downstream readers expect that column.
text_df.to_csv("top1000_cleaned.csv")

   Unnamed: 0                                           abstract  \
0           0  We apply statistical machine translation (SMT)...   
1           1  Parallel corpora have become an essential reso...   
2           2  The concept of maximum entropy can be traced b...   
3           3  We apply the hypothesis of “One Sense Per Disc...   
4           4  Transformation-based learning has been success...   

                                           full_text  \
0  We apply statistical machine translation (SMT)...   
1  Parallel corpora have become an essential reso...   
2  The concept of maximum entropy can be traced b...   
3  We apply the hypothesis of “One Sense Per Disc...   
4  Transformation-based learning has been success...   

                                          conclusion  
0  We presented a novel approach to the problem o...  
1  For each item, participants were instructed to...  
2  We began by introducing the building blocks of...  
3  The trigger labeling task descr