In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV

from data_reader import read_data
from data_preprocessing import preprocess_data
from feature_extractor import extract_features, normalize

%matplotlib inline

In [None]:
# Load the dataset with the project reader and pass it through the project
# preprocessing step (both helpers are imported from modules outside this notebook).
data = preprocess_data(read_data())
# Preview the first rows of the preprocessed result.
data.head()

In [None]:
# Scatter plot over the whole dataset: price on the x-axis, supplier count on the y-axis.
fig, ax = plt.subplots()
ax.scatter(data['RubPrice'], data['SuppliersCount'])
ax.set_title("Все данные")
ax.set_xlabel("цена, руб")
ax.set_ylabel("количество поставщиков")
plt.show()

In [None]:
# Keep only rows where both plotted columns are present (neither is NaN).
data_nn = data[~(np.isnan(data['RubPrice']) | np.isnan(data['SuppliersCount']))]

# Upper cut-offs: the 99.9th percentile of each column, computed on the NaN-free rows.
price_cap = np.percentile(data_nn['RubPrice'], 99.9)
suppliers_cap = np.percentile(data_nn['SuppliersCount'], 99.9)

# NOTE: despite its name, `data95` holds the rows strictly below the 99.9th percentiles.
data95 = data_nn[(data_nn['RubPrice'] < price_cap) & (data_nn['SuppliersCount'] < suppliers_cap)]

# Same scatter as for the full data, now without the extreme tail.
fig, ax = plt.subplots()
ax.scatter(data95['RubPrice'], data95['SuppliersCount'])
ax.set_title("99,9 перцентиль")
ax.set_xlabel("цена, руб")
ax.set_ylabel("количество поставщиков")
plt.show()

In [None]:
# Number of rows for each SuppliersCount value 0..49, printed one per line.
# A single value_counts() pass replaces 50 separate full-frame comparisons;
# reindex() is label-based and fills values that never occur with 0.
supplier_counts = data['SuppliersCount'].value_counts().reindex(range(50), fill_value=0)
for count in supplier_counts:
    print(int(count))

In [None]:
# Number of distinct CurrencyCode values; a missing value counts as its own value.
data['CurrencyCode'].nunique(dropna=False)

In [None]:
# Number of distinct ProcedureDisplayName values; a missing value counts as its own value.
data['ProcedureDisplayName'].nunique(dropna=False)

In [None]:
# "Unbalanced" working set: the first million rows of `data`, in their existing order.
data_unbalanced = data.head(n=1_000_000)
# Feature matrix built by the project's extract_features helper for that subset.
features_unbalanced = extract_features(data_unbalanced)

In [None]:
def ngrams(data, column):
    """Return the names of the n-gram feature columns derived from ``column``.

    Args:
        data: DataFrame whose column labels are inspected.
        column: Name of the source column; matching labels start with
            ``'Ngrams_<column>_'``.

    Returns:
        list[str]: Matching column labels, in the order they appear in
        ``data.columns``. Empty when nothing matches.
    """
    prefix = f'Ngrams_{column}_'
    # Non-string labels (e.g. integer column names) can never match; skipping
    # them avoids the failure the `.str` accessor has on non-string indexes.
    return [label for label in data.columns
            if isinstance(label, str) and label.startswith(prefix)]

def onehots(data, column):
    """Return the names of the one-hot feature columns derived from ``column``.

    Args:
        data: DataFrame whose column labels are inspected.
        column: Name of the source column; matching labels start with
            ``'OneHot_<column>_'``.

    Returns:
        list[str]: Matching column labels, in the order they appear in
        ``data.columns``. Empty when nothing matches.
    """
    prefix = f'OneHot_{column}_'
    # Non-string labels (e.g. integer column names) can never match; skipping
    # them avoids the failure the `.str` accessor has on non-string indexes.
    return [label for label in data.columns
            if isinstance(label, str) and label.startswith(prefix)]

In [None]:
# Design matrix: Title n-gram columns plus the price; target: number of suppliers.
X = features_unbalanced[[*ngrams(features_unbalanced, 'Title'), 'RubPrice']]
y = data_unbalanced['SuppliersCount']
# Row counts of X and y, shown side by side as a sanity check.
len(X), len(y)

In [None]:
# Plain linear regression, 3-fold CV. The scorer is the negated MSE, so the
# displayed mean is negative and values closer to zero are better.
np.mean(cross_val_score(estimator=LinearRegression(), X=X, y=y,
                        scoring='neg_mean_squared_error', cv=3))

In [None]:
# Baseline for comparison: DummyRegressor with default settings, same 3-fold
# CV and negated-MSE scorer as the real models.
np.mean(cross_val_score(estimator=DummyRegressor(), X=X, y=y,
                        scoring='neg_mean_squared_error', cv=3))

In [None]:
# from sklearn.svm import LinearSVR
# cross_val_score(LinearSVR(), X, y, cv=3, scoring='neg_mean_squared_error').mean()

In [None]:
# from sklearn.ensemble import RandomForestRegressor
# cross_val_score(RandomForestRegressor(), X, y, cv=3, scoring='neg_mean_squared_error').mean()

In [None]:
# Ridge on n-gram features of both Title and ProcedureDisplayName.
# `y` is reused from the earlier cell (SuppliersCount of the unbalanced subset).
X = features_unbalanced[[*ngrams(features_unbalanced, 'Title'),
                         *ngrams(features_unbalanced, 'ProcedureDisplayName')]]
np.mean(cross_val_score(estimator=Ridge(), X=X, y=y,
                        scoring='neg_mean_squared_error', cv=3))

In [None]:
# Ridge on Title n-grams plus the one-hot encoding of ProcedureDisplayName.
# `y` is reused from the earlier cell (SuppliersCount of the unbalanced subset).
X = features_unbalanced[[*ngrams(features_unbalanced, 'Title'),
                         *onehots(features_unbalanced, 'ProcedureDisplayName')]]
np.mean(cross_val_score(estimator=Ridge(), X=X, y=y,
                        scoring='neg_mean_squared_error', cv=3))

In [None]:
# cross_val_score(Ridge(), X, y, cv=3, scoring='neg_mean_absolute_error').mean()

In [None]:
# X = features_unbalanced[ngrams(features_unbalanced, 'Title') + ngrams(features_unbalanced, 'ProcedureDisplayName')]
# cross_val_score(Lasso(), X, y, cv=3, scoring='neg_mean_squared_error').mean()

In [None]:
# X = features_unbalanced[ngrams(features_unbalanced, 'Title') + onehots(features_unbalanced, 'ProcedureDisplayName')]
# cross_val_score(Lasso(), X, y, cv=3, scoring='neg_mean_squared_error').mean()

In [None]:
# param_grid = {'alpha': [0.01, 0.05, 0.1, 0.5, 1, 5, 10]}
# optimizer = GridSearchCV(Ridge(), param_grid, scoring='neg_mean_squared_error', cv=3)
# optimizer.fit(X, y)
# optimizer.best_estimator_.coef_, optimizer.best_score_

In [None]:
# X = features_unbalanced[ngrams(features_unbalanced, 'Title') + onehots(features_unbalanced, 'ProcedureDisplayName') + ['RubPrice']]
# cross_val_score(Ridge(), X, y, cv=3, scoring='neg_mean_squared_error').mean()

In [None]:
# Build a class-balanced sample: at most 8095 rows per distinct SuppliersCount.
# NOTE(review): 8095 is presumably the size of the smallest class worth keeping — confirm.
# Rows with a missing SuppliersCount are dropped explicitly; groupby() would
# ignore them as a group key anyway.
g = data.dropna(subset=['SuppliersCount']).groupby('SuppliersCount')

# GroupBy.head takes the first rows of every group directly. It keeps all
# columns, including the grouping key that later cells read as the target, and
# avoids a Python-level callback per group.
# The shuffle is seeded so that re-running the notebook yields the same sample.
data_balanced = (
    g.head(8095)
     .sample(frac=1, random_state=123456)
     .reset_index(drop=True)
)
data_balanced

In [None]:
# Recompute features for the balanced sample and rebuild the design matrix
# (Title n-grams + price) and the target (supplier count) from it.
features_balanced = extract_features(data_balanced)
X = features_balanced[[*ngrams(features_balanced, 'Title'), 'RubPrice']]
y = data_balanced['SuppliersCount']

In [None]:
# Baseline on the balanced sample: DummyRegressor with default settings,
# 3-fold CV, negated-MSE scorer.
np.mean(cross_val_score(estimator=DummyRegressor(), X=X, y=y,
                        scoring='neg_mean_squared_error', cv=3))

In [None]:
# Ridge on the balanced sample: Title n-grams plus one-hot ProcedureDisplayName.
# `y` is the balanced target defined in the earlier cell.
X = features_balanced[[*ngrams(features_balanced, 'Title'),
                       *onehots(features_balanced, 'ProcedureDisplayName')]]
np.mean(cross_val_score(estimator=Ridge(), X=X, y=y,
                        scoring='neg_mean_squared_error', cv=3))

In [None]:
# Single hold-out evaluation: 70% train / 30% test, with a fixed seed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123456)

# Fit Ridge with default settings on the training part.
# (`clf` is a regressor despite the classifier-style name; the name is kept
# in case other cells refer to it.)
clf = Ridge()
clf.fit(X_train, y_train)
pred = clf.predict(X_test)

# Mean squared error on the held-out part. Arguments follow the documented
# (y_true, y_pred) order; passed by keyword so they cannot be swapped silently.
mean_squared_error(y_true=y_test, y_pred=pred)

In [None]:
# Preview the first rows of `data` again.
data.head()

In [1]:
from feature_extractor import normalize
import pandas as pd

In [2]:
# Read the tab-separated file, keeping every column as a string (dtype=str).
# NOTE(review): the rendered table has an 'Unnamed: 0' column, which suggests
# the file was written together with its index — presumably index_col=0 would
# avoid it; confirm before changing.
data = pd.read_csv('result.cropped.10k.tsv', sep='\t', dtype=str)

In [3]:
# Run the project's `normalize` helper on the 'Title' column.
# NOTE(review): the return value is not assigned, so this presumably modifies
# `data` in place — confirm in feature_extractor.
normalize(data, ['Title'])

In [4]:
# Display `data` after the normalize call above.
data

Unnamed: 0,id,Title,Uri,PublicationDateTimeUTC,LawCode,LawDisplayName,ProcedureDisplayName,Amount,RubPrice,CurrencyCode,Nds,StatusDisplayName,StatusCode,SuppliersCount,IsWinner
0,0100600000117000001,закупка товар работа услуга в соответствие с п...,https://zakupki.kontur.ru/0100600000117000001,2017-04-13,1,44-ФЗ,Закупка у единственного поставщика (подрядчика...,1310000,,RUB,,Размещение завершено,2.0,,
1,0100600000117000002,закупка товар работа услуга в соответствие с п...,https://zakupki.kontur.ru/0100600000117000002,2017-04-13,1,44-ФЗ,Закупка у единственного поставщика (подрядчика...,249000,,RUB,,Размещение завершено,2.0,,
2,0100600000117000003,закупка товар работа услуга в соответствие с п...,https://zakupki.kontur.ru/0100600000117000003,2017-07-06,1,44-ФЗ,Закупка у единственного поставщика (подрядчика...,250000,,RUB,,Размещение завершено,2.0,,
3,0100600000117000004,выполнение работа по ремонт автомобильный доро...,https://zakupki.kontur.ru/0100600000117000004,2017-07-06,1,44-ФЗ,Запрос котировок,400300,,RUB,,Размещение завершено,2.0,2.0,1.0
4,0100600000117000005,приобретение и установка спортивный оборудование,https://zakupki.kontur.ru/0100600000117000005,2017-07-10,1,44-ФЗ,Запрос котировок,451000,,RUB,,Размещение завершено,2.0,1.0,0.0
5,0100600000117000006,ремонт и благоустройство воинский захоронение ...,https://zakupki.kontur.ru/0100600000117000006,2017-07-12,1,44-ФЗ,Электронный аукцион,825917,,RUB,,Размещение завершено,2.0,1.0,0.0
6,0100600000117000007,ремонт автомобильный дорога в с копорье,https://zakupki.kontur.ru/0100600000117000007,2017-07-14,1,44-ФЗ,Электронный аукцион,52400614,,RUB,,Размещение завершено,2.0,1.0,0.0
7,0100600000117000008,услуга по капитальный ремонт,https://zakupki.kontur.ru/0100600000117000008,2017-07-27,1,44-ФЗ,Закупка у единственного поставщика (подрядчика...,352000,,RUB,,Размещение отменено,3.0,,
8,0100600000117000009,профилирование грунтовый дорога с подсыпка в н...,https://zakupki.kontur.ru/0100600000117000009,2017-08-15,1,44-ФЗ,Электронный аукцион,1115400,,RUB,,Размещение завершено,2.0,1.0,0.0
9,0100600000117000010,установка энергосберегающий фонарь уличный осв...,https://zakupki.kontur.ru/0100600000117000010,2017-08-15,1,44-ФЗ,Запрос котировок,210000,,RUB,,Размещение завершено,2.0,3.0,1.0
