#### Marker auto-detection
#### Text preprocessing

#### Importing the modules:

In [1]:
# Data loading and plotting packages
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import matplotlib

In [2]:
# Model metric packages 
#from sklearn import cross_validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,roc_curve, auc
from sklearn import metrics 

In [3]:
# Text pre-processing packages 
import re
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
import time 

#### Loading the root data into the system:

In [1]:
root_data=pd.read_excel("data path")
root_data.shape

In [537]:
root_data.head()

In [2]:
set(root_data)

root_data["target var"].value_counts()

In [3]:
# Removing the missing values from free format description

root_data=root_data[root_data["Free format Desc"].notnull()]

#### Removing the records of 'Natural hazard', 'Theft' and "Flood"

In [8]:
# Removing records which has srong words  in "Free format Desc" column

#Strong words-If these words are present then it is total loss 
strong_words=[evident words which could be generalised]

# Removing the strong word records in "Free format Desc" column
root_data1=root_data1[~root_data1['Free format Desc'].str.contains('|'.join(strong_words),case=False)]
print("Number of records removed based on strong words are ",root_data.shape[0]-root_data1.shape[0])

# Reset and drop the index 
root_data1.reset_index(inplace=True,drop=True)

# Result:Number of records removed based on strong words are  137

#### -----------------------------------------------------------------Text Data  Pre-processing-----------------------------------------------------------------------------------------

##### Step 1: Removal of punctuations and special characteristics and handling case senitivity 

In [7]:
#Removal of Special Characters and Punctuations 
root_data1['Free_format_Desc'] = root_data1['Free format Desc'] .str.replace('[^\w\s]','')

# converting into lower case 
root_data1['Free_format_Desc'] = root_data1['Free_format_Desc'].apply(lambda x: " ".join(x.lower() for x in x.split()))

##### Step 2: Elaborating Abbrevations

In [546]:
# class for replace the Abbrevations and spell corrections using word map 
class WordReplacer(object): 
 
    def __init__(self, word_map):
            self.word_map = word_map 
    def replace(self, word): 
         return self.word_map.get(word, word)

In [547]:
# function to replace abbrevations and spell corrections 
def replace_text(text):
    tokenized_text = word_tokenize(text.lower()) 
    replaced_text=[replacer.replace(word) for word in tokenized_text ]
    replaced_text =' '.join(replaced_text)
    return replaced_text

In [548]:
# word map to handle all the abbrevations and spell corrections that occured in metrics in textual data 
wordmap={
# Abbrevations labelling
    
}


In [549]:
# Replacing observations and spell corrections on "Free_format_updated" column
replacer=WordReplacer(wordmap)
root_data1['Free_format_Desc']=root_data1['Free_format_Desc'].apply(replace_text)

In [550]:
# Manual corrections of words which occured in metrics created from text data 
root_data1['Free_format_Desc_Updated'] = root_data1['Free_format_Desc_Updated'] .str.replace()

##### Step 3: Tokenazation and Removal of Stopwords 

In [551]:
# Sysytem defined stop-words-(Eliminated all negated stopwords(Eg: off,over,doesn't,not etc.,))
#sample structure
stop_words=stopwords.words('english')
rm_stop=['before', 'after', 'above', 'below', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'no',
         'nor', 'not', 'too', 'very', "don't", "aren't", "couldn't", 'didn', "didn't", 'doesn', "doesn't", "hadn't", "hasn't",
          "haven't", "wasn't", "weren't"]
stop_words= [word for word in stop_words if word  not in rm_stop]

# user defined stopwords(based on 1 gram analysis)
stop_words.extend(['number', 'plate','paint','presume','pour','car','vehicle','tp','policyholder','policy holder','ph','phv','insured','policy','holder',
                   #stopwords from unigrams
                   'tbc','go','ab','tl','ncrc','ahs','fr','cv','nar','ne','xs','lr',

                   # Stopwords taken from TFIDF 
'a','also','although','always','am','among','amongst','amoungst','amount','an','and','another','any','anyhow','anyone',
'anything','anyway','anywhere','are','around','as','at','be','became','because','become','becomes','becoming','been','being',
'between','bill','but','by','call','cry','de','describe','due','during','each','eg','either','eleven','else','even','ever',
'fifteen','fifty','for','former','formerly','forty','from','further','give','go','here','hereafter','hereby','herein','hereupon',
'hers','him','his','how','however','i','ie','if','inc','interest','is','it','its','itself','latter','latterly','ltd','made',
'may','me','meanwhile','might','mill','mine','my','myself','name','namely','nine','no','noone','now','often','once','our',
'perhaps','rather','re','same','see','seem','seemed','seeming','seems','she','show','since','sincere','six','sixty','so',
'sometime','sometimes','somewhere','still','such','system','take','ten','that','the','their','them','themselves','then',
'thence','there','thereafter','thereby','therefore','therein','thereupon','these','they','third','this','those','thus','twelve',
'twenty','un','until','us','was','we','were','what','whatever','when','whence','whenever','where','whereafter','whereas','whereby',
'wherein','whereupon','wherever','whether','which','who','whoever','whom','whose','why','will','would','you','your','yours',
'yourself','yourselves'])

print("Total number of customized stopwords are :",len(stop_words))

# Result: Total number of customized stopwords are : 337

Total number of customized stopwords are : 349


In [552]:
# Function to replace stopwords 
def clean_text(text):
   
    # Tokenization of text
    tokenized_text = word_tokenize(text.lower()) 
    
    # Removal of Stop words 
    cleaned_text =[t for t in tokenized_text if t not in stop_words and re.match('[a-zA-Z\\-][a-zA-Z\\-]{1,}', t) and t.isalpha()]
    #Join all words
    cleaned_text =' '.join(cleaned_text)
    return cleaned_text

In [553]:
# Replacing Stopwords in "Free_format_updated" column
root_data1['Free_format_Desc']=root_data1['Free_format_Desc'].apply(clean_text)

In [4]:
# Text before stopword removal and handling abbrevations 
root_data1['Free format Desc'].head(3)

In [5]:
# Text after stopword removal 
root_data1['Free_format_Desc_Updated'].head(3)

##### Step 4: Lemmatization of words 

In [556]:
# Function to perform Lematization
lmtzr = WordNetLemmatizer()
def clean_text_lemma(text):
        
    # Tokenization of text
    tokenized_text = word_tokenize(text.lower()) 
    
    # Lemmatization
    cleaned_text =[lmtzr.lemmatize(t,pos='v') for t in tokenized_text]
    
    #Joining all words
    cleaned_text =' '.join(cleaned_text)
    return cleaned_text

In [557]:
# Performing lematization on "Free_format_updated" column
root_data1['Free_format_Desc']=root_data1['Free_format_Desc'].apply(clean_text_lemma)

In [6]:
root_data1['Free_format_Desc_Updated'].head(3)

##### Step 5 :Removing the Records of strong words 

In [559]:
# Removing the strong word records in "free flow text" column
root_data1=root_data1[~root_data1['Free_format_Desc'].str.contains('|'.join(strong_words),case=False)]
root_data1.reset_index(inplace=True,drop=True)

#### -----------------------------------------------------------------------End of Text Data  Pre-Processing------------------------------------------------------------