In [None]:
%matplotlib inline

import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from tqdm import tqdm
from sklearn.datasets import fetch_20newsgroups

from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [None]:
import ssl
# SECURITY NOTE(review): this swaps the default HTTPS context factory for the
# unverified one, so every HTTPS request made through it from this process
# skips certificate verification. Presumably a workaround so the dataset
# download succeeds on a machine with broken CA certificates - confirm, and
# remove the line if the download works without it.
ssl._create_default_https_context = ssl._create_unverified_context

In [None]:
%%time
# Load the two newsgroup categories used for the text-vectorization examples.
data = fetch_20newsgroups(
    subset='all',
    categories=['comp.graphics', 'sci.med'],
)

In [None]:
# Show the category names stored under 'target_names' in the loaded dataset.
data['target_names']

In [None]:
# Pull the raw documents and their label ids out of the dataset container.
texts, target = data['data'], data['target']

In [None]:
# Print the first raw document to see what the input text looks like.
print(texts[0])

In [None]:
# Look up the category name of the first document: target[0] is used as an
# index into the list of category names.
data['target_names'][target[0]]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words baseline: fit a CountVectorizer vocabulary on the whole corpus.
# min_df=1 keeps every term that occurs in at least one document.
vectorizer = CountVectorizer(encoding='utf8', min_df=1)
vectorizer.fit(texts)

In [None]:
# Vectorize only the first document; the slice texts[:1] keeps it a
# one-element list rather than a bare string.
vectorizer.transform(texts[:1])

In [None]:
# Vectorize the first document once, then show which vocabulary columns are
# non-zero (.indices) and the values stored in them (.data).
first_doc_counts = vectorizer.transform(texts[:1])
print(first_doc_counts.indices)
print(first_doc_counts.data)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF vectorizer on the same corpus with the same settings.
# Note: this rebinds `vectorizer`, so from here on the name refers to the
# TfidfVectorizer instance.
vectorizer = TfidfVectorizer(encoding='utf8', min_df=1)
vectorizer.fit(texts)

In [None]:
# Vectorize only the first document with whatever `vectorizer` is currently bound to.
vectorizer.transform(texts[:1])

In [None]:
# Vectorize the first document once, then show its non-zero column indices
# and the weights stored in them.
first_doc_weights = vectorizer.transform(texts[:1])
print(first_doc_weights.indices)
print(first_doc_weights.data)

In [None]:
import nltk
# Snowball stemmer for Russian, used in the word-form examples that follow.
stemmer = nltk.stem.snowball.RussianStemmer()

In [None]:
# Stem two forms of the same adjective and print the stems side by side.
print(*map(stemmer.stem, ['машинное', 'машинный']))

In [None]:
# Stem two forms of the same noun and print the stems side by side.
print(*(stemmer.stem(form) for form in ('машина', 'машины')))

In [None]:
# Switch `stemmer` to the English Snowball stemmer (the name previously held
# the Russian one) before stemming the English newsgroup texts.
stemmer = nltk.stem.snowball.EnglishStemmer()

def stem_text(text, stemmer):
    """Stem every whitespace-separated token of ``text``.

    Parameters
    ----------
    text : str
        Raw text; it is split on any run of whitespace.
    stemmer : object
        Anything exposing ``stem(word)`` that returns a string.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces (original spacing and
        line breaks are not preserved).
    """
    return ' '.join(stemmer.stem(word) for word in text.split())

# Stem only the first 1000 documents; tqdm shows a progress bar while it runs.
stemmed_texts = [stem_text(doc, stemmer) for doc in tqdm(texts[:1000])]

In [None]:
# Original first document, for comparison with its stemmed version.
print(texts[0])

In [None]:
# Stemmed first document: tokens are re-joined with single spaces, so the
# original line breaks are gone.
print(stemmed_texts[0])

In [None]:
import pymorphy3
# Morphological analyzer used for the lemmatization examples that follow.
morph = pymorphy3.MorphAnalyzer()

In [None]:
# Take only the first parse the analyzer returns for this word form.
morph.parse('машинное')[0]

In [None]:
# First parse of another form of the same adjective, for comparison.
morph.parse('машинные')[0]

In [None]:
# No [0] here: display every parse returned. 'стали' is ambiguous in Russian
# (a form of the verb 'стать' or of the noun 'сталь').
morph.parse('стали')

In [None]:
# The whole two-word phrase is passed as a single string.
# NOTE(review): presumably this demonstrates that the analyzer works on single
# tokens and does not use context to disambiguate 'стали' - confirm intent.
morph.parse('стали специалистом')

In [None]:
# Same experiment with a phrase in which 'стали' is the noun sense.
# NOTE(review): the phrase is again passed as one string - confirm this is intended.
morph.parse('сплав стали')

Сравним работу стеммера и лемматизатора на примере:

In [None]:
# Rebind `stemmer` to a Russian stemmer for the comparison with the lemmatizer.
stemmer = nltk.stem.snowball.RussianStemmer()
print(stemmer.stem('машинное'))

In [None]:
# Lemmatizer counterpart: normal form of the first parse of the same word.
print(morph.parse('машинное')[0].normal_form)

In [None]:
# Stem of another form of the same adjective.
print(stemmer.stem('машинная'))

In [None]:
# Normal form (first parse) of that adjective form.
print(morph.parse('машинная')[0].normal_form)

In [None]:
# Stem of the related noun, to compare with the adjective stems printed earlier.
print(stemmer.stem('машина'))

In [None]:
# Normal form (first parse) of the related noun.
print(morph.parse('машина')[0].normal_form)

In [None]:
# Load the tabular dataset from a CSV in the working directory.
# Note: this rebinds `data`, which held the newsgroups dataset until now.
data = pd.read_csv('house_prices.csv')

In [None]:
# Preview the first rows of the loaded table.
data.head()

In [None]:
# Remove the 'Id' column (presumably a row identifier with no predictive
# value - confirm) and split the frame into target and features.
# errors='ignore' keeps the cell re-runnable: on a second run 'Id' is already
# gone from `data`, and a plain drop would raise KeyError.
data = data.drop(columns=['Id'], errors='ignore')
y = data['SalePrice']
X = data.drop(columns=['SalePrice'])

In [None]:
# Side-by-side distributions of the target and of the living-area feature.
# sns.distplot is deprecated; histplot with kde=True and stat='density' is the
# documented replacement (kde_kws=dict(cut=3) mirrors distplot's KDE
# extension beyond the data range).
fig, (ax_target, ax_area) = plt.subplots(1, 2, figsize=(12, 5))

sns.histplot(y, kde=True, stat='density', kde_kws=dict(cut=3),
             label='target', ax=ax_target)
ax_target.set_title('target')

sns.histplot(data.GrLivArea, kde=True, stat='density', kde_kws=dict(cut=3),
             label='area', ax=ax_area)
ax_area.set_title('area')
plt.show()

In [None]:
# Hold out 30% of the rows for testing (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=10
)

# Column list and imputation means are computed from the TRAIN part only and
# then applied to both parts, so no test statistics leak into preprocessing.
numeric_data = X_train.select_dtypes(include=[np.number])
numeric_features = numeric_data.columns
numeric_data_mean = numeric_data.mean()

# Fill gaps with the train means, then keep only the numeric columns.
X_train, X_test = (
    part.fillna(numeric_data_mean)[numeric_features]
    for part in (X_train, X_test)
)

In [None]:
# Distribution of the log-transformed target, log(1 + y).
# sns.distplot is deprecated; histplot with kde=True and stat='density' is the
# documented replacement. np.log1p(y) computes log(1 + y) directly.
sns.histplot(np.log1p(y), kde=True, stat='density', kde_kws=dict(cut=3),
             label='target')
plt.show()

In [None]:
# Ridge regression fitted on the untransformed target.
model = Ridge().fit(X_train, y_train)
y_pred = model.predict(X_test)

# RMSE = square root of the mean squared error on the held-out part.
test_rmse = mean_squared_error(y_test, y_pred) ** 0.5
print('Test RMSE = %.4f' % test_rmse)

In [None]:
# Ridge regression fitted on log(1 + y); predictions are mapped back with
# exp(.) - 1 so the RMSE is measured on the original scale of y_test.
log_y_train = np.log(y_train + 1)
model = Ridge().fit(X_train, log_y_train)
y_pred = np.exp(model.predict(X_test)) - 1

test_rmse = mean_squared_error(y_test, y_pred) ** 0.5
print('Test RMSE = %.4f' % test_rmse)

In [None]:
# Replace GrLivArea with log(1 + GrLivArea) in both parts, in place.
# Caution: this overwrites the column, so running the cell a second time
# applies the logarithm twice.
for frame in (X_train, X_test):
    frame['GrLivArea'] = np.log(frame['GrLivArea'] + 1)

In [None]:
# Ridge regression on the raw target, using whatever X_train / X_test
# currently hold in the numeric feature columns.
train_features = X_train[numeric_features]
test_features = X_test[numeric_features]

model = Ridge().fit(train_features, y_train)
y_pred = model.predict(test_features)

test_rmse = mean_squared_error(y_test, y_pred) ** 0.5
print('Test RMSE = %.4f' % test_rmse)

In [None]:
# Ridge regression on log(1 + y) using the numeric feature columns;
# predictions are mapped back with exp(.) - 1 before scoring.
train_features = X_train[numeric_features]
test_features = X_test[numeric_features]

model = Ridge().fit(train_features, np.log(y_train + 1))
y_pred = np.exp(model.predict(test_features)) - 1

test_rmse = mean_squared_error(y_test, y_pred) ** 0.5
print('Test RMSE = %.4f' % test_rmse)

In [None]:
from sklearn.linear_model import LinearRegression

# Synthetic 1-D regression problem: 100 points uniform on [0, 1) with a
# cosine signal plus Gaussian noise (std 0.1). The seed fixes both draws.
# Note: this rebinds X and y, which held the tabular features/target before.
np.random.seed(36)
X = np.random.uniform(low=0, high=1, size=100)
noise = np.random.normal(scale=0.1, size=X.shape)
y = np.cos(1.5 * np.pi * X) + noise

In [None]:
# Scatter plot of the synthetic sample (feature on x, noisy target on y).
plt.scatter(X, y)

In [None]:
# Turn X into a column so it broadcasts against the row of bin edges.
X = X.reshape((-1, 1))
thresholds = np.arange(0.2, 1.1, 0.2).reshape((1, -1))

# Consecutive edges define half-open bins (lower, upper]; each bin becomes a
# 0/1 indicator column appended after the original feature. Points at or
# below the first edge get zeros in every indicator column.
lower_edges, upper_edges = thresholds[:, :-1], thresholds[:, 1:]
bin_indicators = ((X > lower_edges) & (X <= upper_edges)).astype(int)
X_expand = np.hstack((X, bin_indicators))

In [None]:
# Display the bin edges (a single row, shape (1, k)) used for the indicator features.
thresholds

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [None]:
# 3-fold cross-validated MSE of plain linear regression on the single
# feature. The scorer returns negated MSE, hence the sign flip.
plain_scores = cross_val_score(
    LinearRegression(),
    X,
    y,
    cv=KFold(n_splits=3),
    scoring='neg_mean_squared_error',
)
-np.mean(plain_scores)

In [None]:
# Same 3-fold evaluation on the feature matrix extended with bin indicators.
# The scorer returns negated MSE, hence the sign flip.
expanded_scores = cross_val_score(
    LinearRegression(),
    X_expand,
    y,
    cv=KFold(n_splits=3),
    scoring='neg_mean_squared_error',
)
-np.mean(expanded_scores)

In [None]:
# Display the single-column feature matrix.
X

In [None]:
# Display the expanded matrix: the original feature followed by the 0/1 bin indicators.
X_expand