In [None]:
#Importing necessary libraries
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import nltk
import seaborn as sns
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag
# Fetch the NLTK resources used by this notebook, in a fixed order.
for _resource in (
    'wordnet',
    'stopwords',
    'punkt',
    'averaged_perceptron_tagger',
    'omw-1.4',
):
    nltk.download(_resource)
import re

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
# Path of the training CSV (an absolute Colab-style location).
TRAIN_CSV_PATH = "/content/drive/MyDrive/Colab Notebooks/data/commonlit/train.csv"

# Read the training set and preview its first rows as the cell output.
data = pd.read_csv(TRAIN_CSV_PATH)
data.head()

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845


In [None]:
# Column names, non-null counts and dtypes of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2834 entries, 0 to 2833
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              2834 non-null   object 
 1   url_legal       830 non-null    object 
 2   license         830 non-null    object 
 3   excerpt         2834 non-null   object 
 4   target          2834 non-null   float64
 5   standard_error  2834 non-null   float64
dtypes: float64(2), object(4)
memory usage: 133.0+ KB


In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
data.describe()

Unnamed: 0,target,standard_error
count,2834.0,2834.0
mean,-0.959319,0.491435
std,1.033579,0.034818
min,-3.676268,0.0
25%,-1.69032,0.468543
50%,-0.91219,0.484721
75%,-0.20254,0.506268
max,1.71139,0.649671


In [None]:
# Null counts before cleaning. Only a cell's last expression is rendered, so
# this first report has to be printed explicitly to be visible at all.
print(data.isnull().sum())

# Drop every column that contains at least one missing value. The result is
# rebound to `data` instead of using inplace=True, so the statement reads as a
# plain transformation and can be chained.
data = data.dropna(axis="columns", how="any")

# Null counts after cleaning: every remaining column should report 0.
data.isnull().sum()

id                0
excerpt           0
target            0
standard_error    0
dtype: int64

In [None]:
# Names of the columns left after the null-bearing ones were dropped.
columns = ['id', 'excerpt', 'target', 'standard_error']

# Pairwise correlation of the numeric columns only. `data` still holds text
# columns (id, excerpt); recent pandas versions raise on those instead of
# silently skipping them, so the numeric subset is selected explicitly.
data.select_dtypes(include="number").corr()

Unnamed: 0,target,standard_error
target,1.0,-0.085981
standard_error,-0.085981,1.0


In [None]:
# Text pre-processing: lower-case every excerpt, then trim leading/trailing whitespace.
data["excerpt"] = data["excerpt"].str.lower().str.strip()

In [None]:
#tokenization
# Defining functions
def excerpt_tokens(row):
    """Tokenize one row's excerpt and keep only purely alphabetic tokens.

    Parameters
    ----------
    row : mapping-like
        Must expose the text under the ``'excerpt'`` key (a DataFrame row
        when this function is used with ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    list of str
        The tokens produced by ``word_tokenize`` for which ``str.isalpha()``
        is true, in their original order. Punctuation, numbers and any token
        containing a non-letter character are dropped.
    """
    alphabetic = []
    for token in word_tokenize(row['excerpt']):
        if token.isalpha():
            alphabetic.append(token)
    return alphabetic
# Run the tokenizer once per row and store the resulting token lists.
data['excerpt_tokens'] = data.apply(excerpt_tokens, axis='columns')

In [None]:
# Remove stop words
stop_words = stopwords.words('english')
# Set copy used only for membership tests: a set lookup is constant-time,
# whereas `in` on the list scans it for every single token.
_stop_word_lookup = set(stop_words)

# Note: the surviving tokens are joined back into one space-separated string,
# so after this cell 'excerpt_tokens' holds strings, not lists.
data['excerpt_tokens'] = data['excerpt_tokens'].apply(
    lambda tokens: ' '.join(w for w in tokens if w not in _stop_word_lookup)
)

In [None]:
# Preview: 'excerpt_tokens' now holds the stop-word-free text as a single string per row.
data.head()

Unnamed: 0,id,excerpt,target,standard_error,excerpt_tokens
0,c12129c31,when the young people returned to the ballroom...,-0.340259,0.464009,young people returned ballroom presented decid...
1,85aa80a4c,"all through dinner time, mrs. fayre was somewh...",-0.315372,0.480805,dinner time fayre somewhat silent eyes resting...
2,b69ac6792,"as roger had predicted, the snow departed as q...",-0.580118,0.476676,roger predicted snow departed quickly came two...
3,dd1000b26,and outside before the palace a great garden w...,-1.054013,0.450007,outside palace great garden walled round fille...
4,37c1b32fb,once upon a time there were three bears who li...,0.247197,0.510845,upon time three bears lived together house woo...


In [None]:
# Lemmatizing
wl = WordNetLemmatizer()


def _lemmatize_text(text):
    """Split a space-separated string and lemmatize each word with `wl`."""
    # NOTE(review): lemmatize() is called without a part-of-speech argument,
    # which presumably means noun-only lemmatization — confirm this is intended.
    return [wl.lemmatize(str(piece)) for piece in text.split()]


# 'excerpt_tokens' goes from a string to a list of lemmas here, so this cell
# expects the string form produced by the stop-word removal step.
data['excerpt_tokens'] = data['excerpt_tokens'].apply(_lemmatize_text)

# Overwrite the excerpt text with the cleaned, lemmatized words joined by spaces.
data['excerpt'] = data['excerpt_tokens'].map(' '.join)

In [None]:
# Preview: 'excerpt' is now the cleaned text and 'excerpt_tokens' the list of lemmas.
data.head()

Unnamed: 0,id,excerpt,target,standard_error,excerpt_tokens
0,c12129c31,young people returned ballroom presented decid...,-0.340259,0.464009,"[young, people, returned, ballroom, presented,..."
1,85aa80a4c,dinner time fayre somewhat silent eye resting ...,-0.315372,0.480805,"[dinner, time, fayre, somewhat, silent, eye, r..."
2,b69ac6792,roger predicted snow departed quickly came two...,-0.580118,0.476676,"[roger, predicted, snow, departed, quickly, ca..."
3,dd1000b26,outside palace great garden walled round fille...,-1.054013,0.450007,"[outside, palace, great, garden, walled, round..."
4,37c1b32fb,upon time three bear lived together house wood...,0.247197,0.510845,"[upon, time, three, bear, lived, together, hou..."


In [None]:
# Vectorizing data
def _feature_names(vectorizer):
    """Return a fitted vectorizer's feature names as a list.

    Uses ``get_feature_names_out`` when the vectorizer provides it and falls
    back to the older ``get_feature_names`` otherwise, so the cell runs on
    scikit-learn versions that ship only one of the two methods.
    """
    if hasattr(vectorizer, "get_feature_names_out"):
        return list(vectorizer.get_feature_names_out())
    return list(vectorizer.get_feature_names())


# Raw term counts over the whole vocabulary, as a dense frame with one column per term.
cv = CountVectorizer()
excerpt_cv = cv.fit_transform(data['excerpt']).toarray()
excerpt_cv = pd.DataFrame(excerpt_cv, columns=_feature_names(cv))

# TF-IDF weights limited to at most 50 terms; terms whose document frequency
# exceeds 70% of the excerpts are excluded from the vocabulary.
tfidv = TfidfVectorizer(max_features=50,min_df=1,max_df=0.7)
excerpt_tf = tfidv.fit_transform(data['excerpt']).toarray()
excerpt_tf = pd.DataFrame(excerpt_tf, columns=_feature_names(tfidv))



In [None]:
# Preview: vectorizing did not modify `data`; the features live in excerpt_cv / excerpt_tf.
data.head()

Unnamed: 0,id,excerpt,target,standard_error,excerpt_tokens
0,c12129c31,young people returned ballroom presented decid...,-0.340259,0.464009,"[young, people, returned, ballroom, presented,..."
1,85aa80a4c,dinner time fayre somewhat silent eye resting ...,-0.315372,0.480805,"[dinner, time, fayre, somewhat, silent, eye, r..."
2,b69ac6792,roger predicted snow departed quickly came two...,-0.580118,0.476676,"[roger, predicted, snow, departed, quickly, ca..."
3,dd1000b26,outside palace great garden walled round fille...,-1.054013,0.450007,"[outside, palace, great, garden, walled, round..."
4,37c1b32fb,upon time three bear lived together house wood...,0.247197,0.510845,"[upon, time, three, bear, lived, together, hou..."


In [None]:
#Model building

# Initializing target and features
# Target: the 'target' column of the cleaned frame.
y = data['target']

# Features: the TF-IDF columns plus 'standard_error', placed side by side.
# NOTE(review): 'standard_error' comes from the same labelled file as the
# target — confirm it would also be available for unseen excerpts.
X = pd.concat([excerpt_tf, data['standard_error']], axis=1)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=7,
)

In [None]:
# Gradient-boosted regression trees with default hyper-parameters.
# random_state is pinned (same seed as the train/test split) so that repeated
# runs of the notebook fit the same model; without it the estimator draws its
# seed from the global random state.
Gboost = GradientBoostingRegressor(random_state=7)
Gboost.fit(X_train, y_train)

GradientBoostingRegressor()

In [None]:
# Predict the target for the held-out rows.
y_pred = Gboost.predict(X_test)