**Grafikler Plotly ile çizildiği için Kaggle'dan indirilen .ipynb dosyasında görünmüyor; ancak kod Kaggle'da çalıştırıldığında görünüyor.**

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.metrics import mean_squared_error
import spacy
from tqdm.notebook import tqdm

## Import Data

In [2]:
# Dataset directory, defined once so both reads stay in sync and the
# notebook can be pointed at another location by editing a single line.
DATA_DIR = "../input/commonlitreadabilityprize"

# Training rows (with the `target` column) and the rows to predict.
train = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
test = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))

In [3]:
# Preview the first rows of the training data.
train.head()

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845


Klasik TF-IDF ve CountVectorizer çok kötü sonuçlar veriyor. Ancak spaCy kütüphanesi sayesinde her metin parçasını (excerpt) 300 boyutlu bir vektöre dönüştürüyoruz.

## EDA

In [4]:
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
from scipy import stats

In [5]:
def plot_graphs(df, feature):
    """Show a Q-Q plot and a density plot of one numeric column side by side.

    The left panel plots the ordered sample values against theoretical
    quantiles (as returned by ``scipy.stats.probplot``) together with the
    least-squares reference line. The right panel shows the density curve
    taken from a plotly distplot, with dashed vertical lines and annotations
    marking the mean and the median.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to plot.
    feature : str
        Name of a numeric column in ``df``. Missing values are ignored.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    # Drop missing values up front: NaNs would poison both the Q-Q fit and
    # the kernel density estimate.
    values = df[feature].dropna()

    (osm, osr), (slope, intercept, _) = stats.probplot(values, plot=None)

    fig = make_subplots(
        rows=1, cols=2,
        subplot_titles=(
            "Quantile-Quantile Plot of " + feature,
            "Distribution Plot of " + feature,
        ),
    )

    # Left panel: fitted reference line ...
    fig.add_trace(
        go.Scatter(
            x=osm,
            y=slope*osm + intercept,
            mode='lines',
            line={'color': '#c81515', 'width': 2.5},
        ),
        row=1, col=1,
    )
    # ... and the observed ordered values against the theoretical quantiles.
    fig.add_trace(
        go.Scatter(
            x=osm,
            y=osr,
            mode='markers',
            marker={'color': '#496595'},
        ),
        row=1, col=1,
    )

    # Label the distplot group with the plotted column instead of a fixed
    # 'target' string, so hover text is correct for any feature.
    dist_fig = ff.create_distplot([values], [feature],
                                  bin_size=.05, show_rug=False)
    mean_value = values.mean()
    median_value = values.median()

    # Right panel: reuse only the second distplot trace.
    # NOTE(review): index 1 is expected to be the density curve (index 0
    # being the histogram) when show_rug=False - confirm against plotly.
    fig.add_trace(
        go.Scatter(
            dist_fig['data'][1],
            line=dict(width=1.5),
            fill='tozeroy',
        ),
        row=1, col=2,
    )

    # Annotations: (x, y) is where the arrow points, (ax, ay) is where the
    # text sits. The text is shifted horizontally by 20% of the statistic.
    fig.add_annotation(
        yref="y domain",
        x=mean_value,
        y=0.5,
        axref="x",
        ayref="y domain",
        ax=mean_value + 0.2*mean_value,
        ay=0.1,
        text=f"<span>{feature.capitalize()} mean</span>= {round(mean_value,3)}",
        row=1, col=2,
    )
    fig.add_annotation(
        yref="y domain",
        x=median_value,
        y=0.3,
        axref="x",
        ayref="y domain",
        ax=median_value + 0.2*median_value,
        ay=0.2,
        text=f"<span>{feature.capitalize()} median</span>= {round(median_value,3)}",
        row=1, col=2,
    )

    # Dashed markers: default colour for the mean, red for the median.
    fig.add_vline(
        x=mean_value,
        line_width=2,
        line_dash="dash",
        row=1, col=2,
    )
    fig.add_vline(
        x=median_value,
        line_width=2,
        line_dash="dash",
        line_color='red',
        row=1, col=2,
    )

    fig.update_layout(showlegend=False)
    fig.show()

In [6]:
# Inspect the distribution of the `target` column of the training data.
plot_graphs(train,'target')

## Processing Word2Vec

In [7]:
# Load the large English spaCy model; its word vectors are what we use as
# features (one averaged vector per excerpt).
nlp = spacy.load('en_core_web_lg')

# Only tokenisation is needed to read `Doc.vector` from the model's word
# vectors, so switch off every pipeline component while embedding.
# The component names must be passed explicitly: disable_pipes() called
# without names disables nothing and the full pipeline would still run.
with nlp.disable_pipes(*nlp.pipe_names):
    train_vec = np.array([nlp(text).vector for text in tqdm(train.excerpt)])

  0%|          | 0/2834 [00:00<?, ?it/s]

In [8]:
# Sanity check: dimensionality of the vector produced for a single excerpt.
nlp(train.excerpt[0]).vector.shape

(300,)

In [9]:
# Embed the test excerpts the same way as the training ones. Every pipeline
# component is disabled by name, because disable_pipes() without arguments
# would leave them all running.
with nlp.disable_pipes(*nlp.pipe_names):
    test_vec = np.array([nlp(text).vector for text in tqdm(test.excerpt)])

  0%|          | 0/7 [00:00<?, ?it/s]

In [10]:
# Regression target as a NumPy array, in the same row order as train_vec.
y = train["target"].values

In [11]:
# Both matrices should have one row per excerpt and the same number of columns.
train_vec.shape, test_vec.shape

((2834, 300), (7, 300))

In [12]:
# Peek at the raw feature matrix.
train_vec

array([[ 0.05494171,  0.1046926 , -0.12983888, ..., -0.02843743,
        -0.02245943,  0.068514  ],
       [-0.0147311 ,  0.21387874, -0.18390718, ...,  0.01283445,
         0.01573129,  0.00222419],
       [-0.00667108,  0.21706876, -0.12517808, ..., -0.0197907 ,
         0.02041968,  0.01643671],
       ...,
       [ 0.02947775,  0.16952522, -0.13579851, ..., -0.11207423,
         0.02451298,  0.0817313 ],
       [-0.004197  ,  0.19430712, -0.19940762, ..., -0.07639347,
         0.0401679 ,  0.01555117],
       [-0.08229643,  0.08151519, -0.20916262, ..., -0.11972358,
        -0.030373  , -0.05664112]], dtype=float32)

## Training and Prediction

In [13]:
# Number of cross-validation folds.
NFOLDS = 10
# NOTE(review): no shuffle/random_state is passed to KFold, so the folds
# presumably follow the row order of train.csv - confirm the rows are not
# sorted in a way that would bias the folds.
skf = KFold(n_splits=NFOLDS)
# Materialise the (train_indices, validation_indices) pairs so they can be
# indexed and reused by later cells.
folds = list(skf.split(train_vec))

In [14]:
# Shape of the training portion of the fold at index 5.
train_vec[folds[5][0]].shape

(2551, 300)

In [15]:
# Training row indices of the fold at index 4.
folds[4][0]

array([   0,    1,    2, ..., 2831, 2832, 2833])

In [16]:
# Out-of-fold predictions (one per training row) and the test prediction,
# which is accumulated as the average over the per-fold models.
oof = np.zeros(y.shape)
pred = np.zeros((test_vec.shape[0],))
for idx in range(NFOLDS):
    print("FOLD: ", idx + 1)
    tr_idx, val_idx = folds[idx]
    reg = Ridge()
    reg.fit(train_vec[tr_idx], y[tr_idx])
    # Each training row is predicted only by the model that did not see it.
    oof[val_idx] = reg.predict(train_vec[val_idx])
    pred += reg.predict(test_vec) / NFOLDS

# The reported score is the root of the mean squared error, so name and
# label it as RMSE. Taking the square root explicitly avoids the `squared`
# keyword, which newer scikit-learn releases deprecate and then remove.
oof_rmse = mean_squared_error(y, oof) ** 0.5
print("OOF RMSE:", oof_rmse)

FOLD:  1
FOLD:  2
FOLD:  3
FOLD:  4
FOLD:  5
FOLD:  6
FOLD:  7
FOLD:  8
FOLD:  9
FOLD:  10
OOF MSE: 0.6598504703147183


## Regression Functions

In [17]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge,Lasso
from sklearn.linear_model import ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import ExtraTreeRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error

In [18]:
def algo_test(x, y, random_state=42):
    """Compare several regressors on a single 80/20 hold-out split.

    Each model is fitted with default hyper-parameters on the training part
    of the split and scored on the held-out part.

    Parameters
    ----------
    x : array-like
        Feature matrix, one row per sample.
    y : array-like
        Regression target, same length as ``x``.
    random_state : int, default 42
        Seed used for the train/test split and for every estimator that
        accepts one, so repeated calls return the same table.

    Returns
    -------
    pandas.DataFrame
        One row per model (indexed by model name) with the columns
        ``R_Squared``, ``RMSE`` and ``MAE``, sorted by ``R_Squared`` in
        descending order.
    """
    # Keep each display name next to its estimator so the two cannot drift
    # out of sync; dict order is the evaluation order.
    models = {
        'Linear': LinearRegression(),
        'Ridge': Ridge(),
        'Lasso': Lasso(),
        'ElasticNet': ElasticNet(),
        'Extra Tree': ExtraTreeRegressor(random_state=random_state),
        'Gradient Boosting': GradientBoostingRegressor(random_state=random_state),
        'KNeighborsRegressor': KNeighborsRegressor(),
        'Decision Tree': DecisionTreeRegressor(random_state=random_state),
        'XGBRegressor': XGBRegressor(random_state=random_state),
    }

    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=.2, random_state=random_state
    )

    # One record per model, collected in the same order as `models`.
    scores = []
    for model in models.values():
        p = model.fit(x_train, y_train).predict(x_test)
        scores.append({
            'R_Squared': r2_score(y_test, p),
            'RMSE': mean_squared_error(y_test, p)**.5,
            'MAE': mean_absolute_error(y_test, p),
        })

    result = pd.DataFrame(
        scores,
        index=list(models),
        columns=['R_Squared', 'RMSE', 'MAE'],
    )
    return result.sort_values('R_Squared', ascending=False)

In [19]:
# Score all candidate regressors on the document vectors.
algo_test(train_vec, y)

Unnamed: 0,R_Squared,RMSE,MAE
Ridge,0.601022,0.646347,0.510546
Gradient Boosting,0.578942,0.663991,0.5271
Linear,0.566348,0.673848,0.524867
XGBRegressor,0.520676,0.708445,0.565746
KNeighborsRegressor,0.474169,0.742018,0.597284
Decision Tree,0.018338,1.013847,0.802271
Lasso,-0.000616,1.023588,0.831197
ElasticNet,-0.000616,1.023588,0.831197
Extra Tree,-0.116447,1.081211,0.8597


## SUBMISSION

In [20]:
# Submission frame: the test ids paired with the fold-averaged predictions.
sub = test[["id"]].assign(target=pred)

In [21]:
# Quick look at the submission before writing it out.
sub.head()

Unnamed: 0,id,target
0,c0f722661,-0.965339
1,f0953f0a5,-0.318864
2,0df072751,-0.636885
3,04caf4e0c,-2.322496
4,0e63f8bea,-1.574127


In [22]:
# Write the submission file to the working directory, without the index column.
sub.to_csv("submission.csv", index=False)