# Простая бейзлайн-модель

In [1]:
import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
import seaborn as sns

from sklearn import pipeline, preprocessing
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, KFold
from sklearn import model_selection, metrics
from sklearn.feature_extraction.text import CountVectorizer

#import models
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, BaggingClassifier, VotingClassifier, RandomTreesEmbedding
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# from df_transformers import SelectColumnsTransfomer

%matplotlib inline

In [2]:
train_data = pd.read_csv("data/cleaned_train_data.csv", index_col="id")
test_data = pd.read_csv("data/cleaned_test_data.csv", index_col="id")
print(f"Shape of train data: {train_data.shape}. Shape of test data: {test_data.shape}")
train_data.head()

Shape of train data: (7613, 4). Shape of test data: (3263, 3)


Unnamed: 0_level_0,keyword,location,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,,,deed reason #earthquake may allah forgive u,1
4,,,forest fire near la ronge sask. canada,1
5,,,resident asked 'shelter place' notified office...,1
6,,,"13,000 people receive #wildfires evacuation or...",1
7,,,got sent photo ruby #alaska smoke #wildfires p...,1


### Векторизация документов (извлечение фич)

In [3]:
count_vectorizer = CountVectorizer()

train_vectors = count_vectorizer.fit_transform(train_data["text"])
test_vectors = count_vectorizer.transform(test_data["text"])

### Разделяем выборку на трейн и тест

In [4]:
X = train_vectors
y = train_data['target']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

### Определим кроссвалидационную стратегию

In [6]:
seed = 3
scoring = 'f1'
n_folds = 7

def f1_cv(model):
    kfold = KFold(n_splits=n_folds, random_state=seed)
    return cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring, n_jobs=-1)

### Модель

In [7]:
clf = RidgeClassifier()

In [8]:
f1_cv(clf).mean()

0.7216137383720233

last result 0.7251916565998516

In [9]:
clf.fit(X, y)

RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
                max_iter=None, normalize=False, random_state=None,
                solver='auto', tol=0.001)

In [10]:
test_data['target'] = clf.predict(test_vectors)

In [11]:
test_data["target"].to_csv('ridge.csv', header=True)

### Модели

In [12]:
# источник - https://www.kaggle.com/kabure/eda-pipelines-v2-0-easy-to-begineers

clfs = []

clfs.append(("LogReg", 
             Pipeline([("LogReg", LogisticRegression())])))

clfs.append(("KNN", 
             Pipeline([("KNN", KNeighborsClassifier())]))) 

clfs.append(("DecisionTreeClassifier", 
             Pipeline([("DecisionTrees", DecisionTreeClassifier())]))) 

clfs.append(("RandomForestClassifier", 
             Pipeline([("RandomForest", RandomForestClassifier(n_estimators=100))]))) 

clfs.append(("GradientBoostingClassifier", 
             Pipeline([("GradientBoosting", GradientBoostingClassifier(n_estimators=100))]))) 

clfs.append(("RidgeClassifier", 
             Pipeline([("RidgeClassifier", RidgeClassifier())])))

clfs.append(("BaggingRidgeClassifier",
             Pipeline([("BaggingClassifier", BaggingClassifier())])))

clfs.append(("ExtraTreesClassifier",
             Pipeline([("ExtraTrees", ExtraTreesClassifier())])))

In [13]:
results, names  = [], [] 

for name, model  in clfs:
    cv_results = f1_cv(model)
    names.append(name)
    results.append(cv_results)    
    msg = "%s: %f (+/- %f)" % (name, cv_results.mean(),  cv_results.std())
    print(msg)

LogReg: 0.736900 (+/- 0.013458)
KNN: 0.429724 (+/- 0.026058)
DecisionTreeClassifier: 0.701058 (+/- 0.015594)
RandomForestClassifier: 0.716296 (+/- 0.015611)
GradientBoostingClassifier: 0.603051 (+/- 0.023572)
RidgeClassifier: 0.721456 (+/- 0.010208)
BaggingRidgeClassifier: 0.715513 (+/- 0.014002)
ExtraTreesClassifier: 0.699081 (+/- 0.024002)


### Логистическая регрессия

In [14]:
clf = LogisticRegression()

In [15]:
clf.fit(X, y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [16]:
test_data['target'] = clf.predict(test_vectors)

In [17]:
test_data["target"].to_csv('logreg.csv', header=True)

In [18]:
test_data['target'].shape

(3263,)