In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Load the competition CSVs, echoing each frame's shape right after it is read.
train_dir ="../data/input/commonlitreadabilityprize/train.csv"
test_dir = "../data/input/commonlitreadabilityprize/test.csv"
submission_dir = "../data/input/commonlitreadabilityprize/sample_submission.csv"


def _read_and_report(csv_path):
    """Read one CSV into a DataFrame, print its shape, and return the frame."""
    frame = pd.read_csv(csv_path)
    print(frame.shape)
    return frame


train = _read_and_report(train_dir)
test = _read_and_report(test_dir)
submission = _read_and_report(submission_dir)


(2834, 6)
(7, 4)
(7, 2)


In [3]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845


In [4]:
# Preview the first rows of the test frame.
test.head()

Unnamed: 0,id,url_legal,license,excerpt
0,c0f722661,,,My hope lay in Jack's promise that he would ke...
1,f0953f0a5,,,Dotty continued to go to Mrs. Gray's every nig...
2,0df072751,,,It was a bright and cheerful scene that greete...
3,04caf4e0c,https://en.wikipedia.org/wiki/Cell_division,CC BY-SA 3.0,Cell division is the process by which a parent...
4,0e63f8bea,https://en.wikipedia.org/wiki/Debugging,CC BY-SA 3.0,Debugging is the process of finding and resolv...


# Preprocessing the data

In [5]:
# NLP dependencies for the text-preprocessing pipeline defined below.
import nltk
nltk.download('stopwords')  # fetch the NLTK 'stopwords' package (no-op if already up to date)
from nltk.corpus import stopwords
import re
import string
from nltk.tokenize.treebank import TreebankWordDetokenizer, TreebankWordTokenizer
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')  # fetch the NLTK 'punkt' package (no-op if already up to date)
import contractions, inflect, unicodedata, re  # note: `re` is already imported above
from bs4 import BeautifulSoup 
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer

[nltk_data] Downloading package stopwords to /home/jupyter-
[nltk_data]     tram.ho.jka/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jupyter-
[nltk_data]     tram.ho.jka/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
# Configurable text-cleaning pipeline applied to each excerpt before vectorizing.
class TextPreprocess(object):
    """Configurable text-cleaning pipeline.

    Every ``do_*`` constructor flag switches one step of :meth:`clean_text`
    on or off.  The steps always run in the fixed order coded in
    :meth:`clean_text`, regardless of the order the flags are passed in.

    Notes:
        * ``do_split_on_white_space`` is stored for backward compatibility
          but is not consulted by :meth:`clean_text`.
        * If both ``do_to_lower`` and ``do_to_upper`` are enabled, the
          upper-casing step runs second and therefore wins.
        * With ``do_tokenize=True`` :meth:`clean_text` returns a list of
          tokens instead of a string.
    """

    def __init__(self,
                do_remove_between_square_brackets = True,
                do_to_lower=True, 
                do_to_upper=False, 
                do_remove_number=False,
                do_remove_punctuation=True,
                do_split_on_white_space = True,
                do_remove_stopwords=False, 
                do_remove_short_tokens=False,
                do_remove_non_ascii_chars=True,
                do_replace_contractions=True,
                do_stemming = True,
                do_tokenize=False):
        """Store the on/off switch for every pipeline step."""
        self.do_remove_between_square_brackets=do_remove_between_square_brackets
        self.do_to_lower=do_to_lower 
        self.do_to_upper=do_to_upper 
        self.do_remove_number=do_remove_number
        self.do_remove_punctuation=do_remove_punctuation 
        self.do_split_on_white_space=do_split_on_white_space 
        self.do_remove_stopwords=do_remove_stopwords 
        self.do_remove_short_tokens=do_remove_short_tokens
        self.do_remove_non_ascii_chars=do_remove_non_ascii_chars
        self.do_replace_contractions=do_replace_contractions
        self.do_stemming = do_stemming
        self.do_tokenize=do_tokenize
        # Lazily filled by remove_stopwords() so the corpus is read only once.
        self._stop_words = None

    def remove_between_square_brackets(self, text):
        """Delete every ``[...]`` span (brackets included) from ``text``."""
        # Raw string: the non-raw form contains the invalid escape '\[',
        # which newer Python versions warn about.
        return re.sub(r'\[[^]]*\]', '', text)

    def to_lower(self, text):
        """Return ``text`` lower-cased; returns ``None`` when ``text`` is not a ``str``."""
        if type(text) is str:
            return text.lower()

    def to_upper(self, text):
        """Return ``text`` upper-cased."""
        return text.upper()

    def remove_number(self, text):
        """Strip every run of digits from ``text``."""
        return re.sub(r'\d+', '', text)

    def remove_punctuation(self, text):
        """Drop every character listed in ``string.punctuation``."""
        return text.translate(str.maketrans('', '', string.punctuation))

    def split_on_white_space(self, text):
        """Split ``text`` on whitespace; returns ``None`` when ``text`` is ``None``."""
        if text is not None:
            return text.split()

    def remove_stopwords(self, text):
        """Remove English stop words from ``text`` and re-join the remaining tokens.

        Returns ``None`` when ``text`` is ``None``.
        """
        if text is None:
            return None
        # Build the stop-word set once per instance instead of on every call.
        # getattr() keeps this working for instances created without __init__.
        stop_words = getattr(self, '_stop_words', None)
        if stop_words is None:
            stop_words = self._stop_words = set(stopwords.words('english'))
        tokens = [w for w in self.split_on_white_space(text) if w not in stop_words]
        return TreebankWordDetokenizer().detokenize(tokens)

    def stem_sentence(self, text):
        """Porter-stem every token of ``text``.

        Each stem is followed by a single space, so a non-empty result ends
        with a trailing space (kept so existing outputs do not change).
        """
        porter = PorterStemmer()
        return "".join(porter.stem(word) + " " for word in word_tokenize(text))

    def remove_short_tokens(self, text):
        """Drop whitespace-separated tokens of length 1 and re-join the rest.

        Returns ``None`` when ``text`` is ``None`` and ``""`` when no token
        survives (including empty or whitespace-only input).
        """
        if text is None:
            return None
        tokens = [token for token in self.split_on_white_space(text) if len(token) > 1]
        if not tokens:
            # Return an empty string rather than None: a None here would be
            # turned into the literal "None" by str() in remove_non_ascii_chars.
            return ""
        return TreebankWordDetokenizer().detokenize(tokens)

    def remove_non_ascii_chars(self, text):
        """NFKD-normalise ``str(text)`` and discard anything that is not ASCII."""
        return unicodedata.normalize('NFKD', str(text)).encode('ascii', 'ignore').decode('utf-8', 'ignore')

    def replace_contractions(self, text):
        """Expand contractions in ``text`` via ``contractions.fix``."""
        return contractions.fix(text)

    def tokenize(self, text):
        """Tokenize ``text`` with the Treebank tokenizer and return the token list."""
        return TreebankWordTokenizer().tokenize(text)

    def clean_text(self, processed_text):
        """Run every enabled step over ``processed_text`` and return the result.

        Args:
            processed_text: the raw text to clean.

        Returns:
            The cleaned string, or a list of tokens when ``do_tokenize`` is set.
        """
        if self.do_remove_between_square_brackets:
            processed_text = self.remove_between_square_brackets(processed_text)

        if self.do_remove_number:
            processed_text = self.remove_number(processed_text)

        if self.do_to_lower:
            processed_text = self.to_lower(processed_text)

        if self.do_to_upper:
            processed_text = self.to_upper(processed_text)

        if self.do_remove_punctuation:
            processed_text = self.remove_punctuation(processed_text)

        if self.do_remove_stopwords:
            processed_text = self.remove_stopwords(processed_text)

        if self.do_remove_short_tokens:
            # Fixed: this branch used to call remove_stopwords() by mistake.
            processed_text = self.remove_short_tokens(processed_text)

        if self.do_remove_non_ascii_chars:
            processed_text = self.remove_non_ascii_chars(processed_text)

        # NOTE(review): contractions are expanded after punctuation removal, so
        # apostrophes are already gone at this point -- confirm contractions.fix
        # still matches the intended forms before relying on this step.
        if self.do_replace_contractions:
            processed_text = self.replace_contractions(processed_text)

        if self.do_stemming:
            processed_text = self.stem_sentence(processed_text)

        if self.do_tokenize:
            processed_text = self.tokenize(processed_text)

        return processed_text

In [7]:
# Build the preprocessor with its default step configuration.
pp = TextPreprocess()

In [8]:
# Example: run the default pipeline on a sample sentence to inspect its output.
sentence = "He was running and eating at same time. He has bad habit of swimming after playing long hours in the Sun."
pp.clean_text(sentence)

'he wa run and eat at same time he ha bad habit of swim after play long hour in the sun '

In [9]:
# Clean every training excerpt; the bound method is passed directly to apply().
train['cleaned_text'] = train['excerpt'].apply(pp.clean_text)

In [10]:
# Preview the training frame again to inspect the cleaned text next to the raw excerpt.
train.head()

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error,cleaned_text
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009,when the young peopl return to the ballroom it...
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805,all through dinner time mr fayr wa somewhat si...
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676,as roger had predict the snow depart as quickl...
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007,and outsid befor the palac a great garden wa w...
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845,onc upon a time there were three bear who live...


In [11]:
# Clean every test excerpt with the same preprocessor used for the training data.
test['cleaned_text'] = test['excerpt'].apply(pp.clean_text)

# Vectorizing

In [13]:
# TF-IDF vectorizer with default settings; it is fitted on the training text in a later cell.
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()

In [14]:
# Pull the cleaned text of both splits out as arrays and report their shapes.
X_train, X_test = train['cleaned_text'].values, test['cleaned_text'].values
print(X_train.shape, X_test.shape, sep="\n")

# Regression target for the training rows.
y_train = train['target'].values
print(y_train.shape)

(2834,)
(7,)
(2834,)


In [15]:
# Fit the TF-IDF vectorizer on the training text and vectorize it in one step.
X_train_vect = tfidf.fit_transform(X_train)
X_train_vect.shape

(2834, 20173)

In [16]:
# Vectorize the test text with the already-fitted vectorizer (transform only, no refit).
X_test_vect = tfidf.transform(X_test)
X_test_vect.shape

(7, 20173)

# Modelling

In [17]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

In [18]:
# Baseline: linear regression on the TF-IDF features, evaluated with
# 10-fold cross-validation using the negative-RMSE scorer.
lm = LinearRegression()
scores = cross_val_score(
    estimator=lm,
    X=X_train_vect,
    y=y_train,
    scoring='neg_root_mean_squared_error',
    cv=10,
)
scores

array([-0.71371237, -0.82730776, -0.83080425, -0.77087401, -0.72405656,
       -0.82621282, -0.79068367, -0.82627301, -0.70381543, -0.81776739])

In [19]:
# Fit the model on the full training set (cross_val_score above does not fit `lm` itself).
lm.fit( X_train_vect, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [20]:
# Write the model's predictions for the test rows into the submission frame.
submission['target'] = lm.predict(X_test_vect)


In [21]:
# Preview the first rows of the filled-in submission frame.
submission.head()

Unnamed: 0,id,target
0,c0f722661,-1.078905
1,f0953f0a5,-0.854981
2,0df072751,-0.096052
3,04caf4e0c,-1.645488
4,0e63f8bea,-0.95278


In [22]:
# Save the submission to the working directory without the index column.
submission.to_csv("./submission.csv",index=False)
