# All MSMO Data Cleaning

## Importing necessary libraries

In [1]:
import re
import os
from bs4 import BeautifulSoup
from tqdm import tqdm

### Contraction mapping dictionary

In [2]:
# Lookup table mapping an English contraction to its expanded form.
# Every contraction is listed twice: first spelled with the typographic
# apostrophe (’), then again with the ASCII apostrophe ('). The final entry
# maps a bare "n't" token to "not".
# NOTE: data_clean() lowercases text before looking tokens up in this table,
# so the capitalised "I…" keys are never matched on that path.
contraction_mapping = {"ain’t": "is not", "aren’t": "are not","can’t": "cannot", "’cause": "because", "could’ve": "could have", "couldn’t": "could not",
                          "didn’t": "did not", "doesn’t": "does not", "don’t": "do not", "hadn’t": "had not", "hasn’t": "has not", "haven’t": "have not",
                          "he’d": "he would","he’ll": "he will", "he’s": "he is", "how’d": "how did", "how’d’y": "how do you", "how’ll": "how will", "how’s": "how is",
                          "I’d": "I would", "I’d’ve": "I would have", "I’ll": "I will", "I’ll’ve": "I will have","I’m": "I am", "I’ve": "I have", "i’d": "i would",
                          "i’d’ve": "i would have", "i’ll": "i will",  "i’ll’ve": "i will have","i’m": "i am", "i’ve": "i have", "isn’t": "is not", "it’d": "it would",
                          "it’d’ve": "it would have", "it’ll": "it will", "it’ll’ve": "it will have","it’s": "it is", "let’s": "let us", "ma’am": "madam",
                          "mayn’t": "may not", "might’ve": "might have","mightn’t": "might not","mightn’t’ve": "might not have", "must’ve": "must have",
                          "mustn’t": "must not", "mustn’t’ve": "must not have", "needn’t": "need not", "needn’t’ve": "need not have","o’clock": "of the clock",
                          "oughtn’t": "ought not", "oughtn’t’ve": "ought not have", "shan’t": "shall not", "sha’n’t": "shall not", "shan’t’ve": "shall not have",
                          "she’d": "she would", "she’d’ve": "she would have", "she’ll": "she will", "she’ll’ve": "she will have", "she’s": "she is",
                          "should’ve": "should have", "shouldn’t": "should not", "shouldn’t’ve": "should not have", "so’ve": "so have","so’s": "so as",
                          "this’s": "this is","that’d": "that would", "that’d’ve": "that would have", "that’s": "that is", "there’d": "there would",
                          "there’d’ve": "there would have", "there’s": "there is", "here’s": "here is","they’d": "they would", "they’d’ve": "they would have",
                          "they’ll": "they will", "they’ll’ve": "they will have", "they’re": "they are", "they’ve": "they have", "to’ve": "to have",
                          "wasn’t": "was not", "we’d": "we would", "we’d’ve": "we would have", "we’ll": "we will", "we’ll’ve": "we will have", "we’re": "we are",
                          "we’ve": "we have", "weren’t": "were not", "what’ll": "what will", "what’ll’ve": "what will have", "what’re": "what are",
                          "what’s": "what is", "what’ve": "what have", "when’s": "when is", "when’ve": "when have", "where’d": "where did", "where’s": "where is",
                          "where’ve": "where have", "who’ll": "who will", "who’ll’ve": "who will have", "who’s": "who is", "who’ve": "who have",
                          "why’s": "why is", "why’ve": "why have", "will’ve": "will have", "won’t": "will not", "won’t’ve": "will not have",
                          "would’ve": "would have", "wouldn’t": "would not", "wouldn’t’ve": "would not have", "y’all": "you all",
                          "y’all’d": "you all would","y’all’d’ve": "you all would have","y’all’re": "you all are","y’all’ve": "you all have",
                          "you’d": "you would", "you’d’ve": "you would have", "you’ll": "you will", "you’ll’ve": "you will have",
                          "you’re": "you are", "you’ve": "you have","ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not",
                          "didn't": "did not", "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not",
                          "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",
                          "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would",
                          "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would",
                          "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam",
                          "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have",
                          "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock",
                          "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",
                          "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is",
                          "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as",
                          "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would",
                          "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have",
                          "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have",
                          "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are",
                          "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",
                          "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is",
                          "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",
                          "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have",
                          "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all",
                          "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
                          "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have",
                          "you're": "you are", "you've": "you have","n't":'not'}

## Data Cleaning function

In [3]:
def data_clean(doc, contraction_mapping):
    """Normalise raw text items into lowercase alphanumeric token strings.

    Each item of ``doc`` is lowercased, stripped of HTML markup with
    BeautifulSoup, has its contractions expanded through
    ``contraction_mapping`` and is finally reduced to ASCII letters and
    digits separated by single spaces.

    Args:
        doc: Iterable of documents; every item is converted with ``str()``.
        contraction_mapping: Mapping from a contraction token (for example
            ``"don't"``) to its expansion (``"do not"``). Lookups are made
            on lowercased, whitespace-delimited tokens, so a contraction
            with punctuation attached to it is not expanded.

    Returns:
        A list with one cleaned string per item of ``doc``, in input order.
    """
    # Patterns are built once, outside the loop, and written as raw strings
    # so that no invalid-escape warning is raised when the code is compiled.
    # Characters deleted outright, so the text on either side is joined
    # (e.g. "1,000" -> "1000").
    delete_chars = re.compile(r"[,'°\n]")
    # Everything else that is not an ASCII letter or digit becomes a space.
    # This single pass also covers brackets, quotes, dashes, "!", "?" and
    # "...", so no separate substitution is needed for them.
    non_alnum = re.compile(r"[^a-zA-Z0-9]")
    whitespace_run = re.compile(r"\s+")

    clean = []
    for raw in tqdm(doc):
        # Lowercase first, then let the parser drop any markup.
        text = BeautifulSoup(str(raw).lower(), 'lxml').text
        # Re-attach clitics that were split off as separate tokens
        # ("do n't" -> "don't", "he 's" -> "he's") so the lookup can match.
        text = text.replace(" '", "'").replace(" n't", "n't")
        text = ' '.join(
            contraction_mapping.get(token, token) for token in text.split()
        )
        # Remove what is left of "'s" / "’s" after contraction expansion.
        text = text.replace("'s", "").replace("’s", "")
        text = delete_chars.sub('', text)
        text = non_alnum.sub(' ', text)
        clean.append(whitespace_run.sub(' ', text).strip())
    return clean

## Cleaning Source and Target Training Documents

In [4]:
print("Cleaning Source Training Data")

# Reading file. The encoding is given explicitly because the cleaner looks
# for non-ASCII characters (’ £ ° —) and the platform default encoding is
# not guaranteed to be UTF-8.
with open("train_document.txt",'r',encoding="utf-8") as file:
    doc=file.readlines()

# Cleaning data: one cleaned string per input line
final_data=data_clean(doc,contraction_mapping)

# Saving the cleaned documents, one per line
with open("src-train.txt",'w',encoding="utf-8") as file:
    for summary in tqdm(final_data):
        file.write(summary+'\n')

print("Cleaning Target Training Data")

# Reading file (same explicit encoding as for the source documents)
with open("train_title.txt",'r',encoding="utf-8") as file:
    doc=file.readlines()

# Cleaning data: one cleaned string per input line
final_data=data_clean(doc,contraction_mapping)

# Saving the cleaned titles, one per line
with open("tgt-train.txt",'w',encoding="utf-8") as file:
    for summary in tqdm(final_data):
        file.write(summary+'\n')


Cleaning Source Training Data


100%|██████████| 293625/293625 [07:19<00:00, 668.69it/s] 
100%|██████████| 293625/293625 [00:01<00:00, 246522.56it/s]


Cleaning Target Training Data


100%|██████████| 293625/293625 [02:19<00:00, 2108.51it/s]
100%|██████████| 293625/293625 [00:00<00:00, 733552.95it/s]


## Cleaning Source and Target Validation Documents

In [8]:
print("Cleaning Source Validation Data")

# Reading file. The encoding is given explicitly because the cleaner looks
# for non-ASCII characters (’ £ ° —) and the platform default encoding is
# not guaranteed to be UTF-8.
with open("val_document.txt",'r',encoding="utf-8") as file:
    doc=file.readlines()

# Cleaning data: one cleaned string per input line
final_data=data_clean(doc,contraction_mapping)

# Saving the cleaned documents, one per line
with open("src-val.txt",'w',encoding="utf-8") as file:
    for summary in tqdm(final_data):
        file.write(summary+'\n')

print("Cleaning Target Validation Data")

# Reading file (same explicit encoding as for the source documents)
with open("val_title.txt",'r',encoding="utf-8") as file:
    doc=file.readlines()

# Cleaning data: one cleaned string per input line
final_data=data_clean(doc,contraction_mapping)

# Saving the cleaned titles, one per line
with open("tgt-val.txt",'w',encoding="utf-8") as file:
    for summary in tqdm(final_data):
        file.write(summary+'\n')


  1%|          | 105/10339 [00:00<00:09, 1045.53it/s]

Cleaning Source Validation Data


100%|██████████| 10339/10339 [00:13<00:00, 744.13it/s]
100%|██████████| 10339/10339 [00:00<00:00, 120617.90it/s]
  2%|▏         | 188/10339 [00:00<00:05, 1878.36it/s]

Cleaning Target Validation Data


100%|██████████| 10339/10339 [00:05<00:00, 2016.30it/s]
100%|██████████| 10339/10339 [00:00<00:00, 671553.71it/s]


## Cleaning Source and Target Test Documents

In [9]:
print("Cleaning Source Test Data")

# Reading file. The encoding is given explicitly because the cleaner looks
# for non-ASCII characters (’ £ ° —) and the platform default encoding is
# not guaranteed to be UTF-8.
with open("test_document.txt",'r',encoding="utf-8") as file:
    doc=file.readlines()

# Cleaning data: one cleaned string per input line
final_data=data_clean(doc,contraction_mapping)

# Saving the cleaned documents, one per line
with open("src-test.txt",'w',encoding="utf-8") as file:
    for summary in tqdm(final_data):
        file.write(summary+'\n')

# This step processes the test titles, so the progress message names the
# test split.
print("Cleaning Target Test Data")

# Reading file (same explicit encoding as for the source documents)
with open("test_title.txt",'r',encoding="utf-8") as file:
    doc=file.readlines()

# Cleaning data: one cleaned string per input line
final_data=data_clean(doc,contraction_mapping)

# Saving the cleaned titles, one per line
with open("tgt-test.txt",'w',encoding="utf-8") as file:
    for summary in tqdm(final_data):
        file.write(summary+'\n')


  0%|          | 51/10245 [00:00<00:20, 505.54it/s]

Cleaning Source Test Data


100%|██████████| 10245/10245 [00:18<00:00, 560.57it/s]
100%|██████████| 10245/10245 [00:00<00:00, 100250.90it/s]
  2%|▏         | 179/10245 [00:00<00:05, 1782.84it/s]

Cleaning Target Source Data


100%|██████████| 10245/10245 [00:05<00:00, 1897.39it/s]
100%|██████████| 10245/10245 [00:00<00:00, 401527.26it/s]
