In [None]:
# ワーニングメッセージを非表示にする
import warnings
warnings.simplefilter("ignore")

# データ分析によく使われるライブラリをインポートする
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Jupyter NotebookでMatplotlibのグラフを表示するための設定
%matplotlib inline

# 機械学習モデルの作成に必要なライブラリをインポートする
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,roc_auc_score,confusion_matrix
from sklearn.metrics import roc_curve
from sklearn import tree

# Preprocessing: StandardScaler is used further down to standardize the feature columns
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the training set, the test set and the sample submission from ./original.
csv_paths = (
    './original/train.csv',
    './original/test.csv',
    './original/sample_submission.csv',
)
train, test, sample = map(pd.read_csv, csv_paths)

In [None]:
# Show the (rows, columns) shape of each loaded table on one line.
train_shape, test_shape, sample_shape = train.shape, test.shape, sample.shape
print('Train:', train_shape, "Test:", test_shape, "Sample:", sample_shape)

In [None]:
# Print the training frame's column names, dtypes, non-null counts and memory usage.
train.info()

In [None]:
# Same summary for the test frame, to compare its columns and dtypes with the training frame.
test.info()

In [None]:
# Summary of the sample submission file as parsed with the default header handling.
sample.info()

In [None]:
# Preview the first rows of the raw training data.
train.head()

In [None]:
# Bar chart of how many training rows fall into each price_range label.
fig, ax = plt.subplots(figsize=(7, 5))
sns.countplot(x='price_range', data=train, ax=ax)

# The title text reads "distribution of the target labels".
ax.set_title('正解データの分布')

plt.show()

In [None]:
# Correlation matrix of every column except "id".
# DataFrame.drop returns a new frame, so `train` itself is left untouched.
train_drop = train.drop(columns=["id"])
train_drop.corr()

In [None]:
# Heatmap of the pairwise correlations, with the x tick labels forced horizontal.
corr_matrix = train_drop.corr()
fig, ax = plt.subplots(figsize=(9, 5))
sns.heatmap(corr_matrix, ax=ax)
plt.setp(ax.get_xticklabels(), rotation=0)
plt.show()

In [None]:
# Feature columns fed to the model ("id" and the target "price_range" are left out).
use_columns = [
    "battery_power", "blue", "clock_speed", "dual_sim", "fc",
    "four_g", "int_memory", "m_dep", "mobile_wt", "n_cores",
    "pc", "px_height", "px_width", "ram", "sc_h",
    "sc_w", "talk_time", "three_g", "touch_screen", "wifi",
]

# Keep the target before `train` is narrowed to the feature columns.
# NOTE(review): this cell rebinds `train`, so running it a second time fails at
# train["price_range"]; re-run the CSV loading cell first if it has to be repeated.
y = train["price_range"]
train, test = train[use_columns], test[use_columns]

In [None]:
# Preview the training frame after it was narrowed to the use_columns features.
train.head()

In [None]:
# Standardize the features: the scaler learns its statistics from the training
# features and the same fitted scaler is then applied to both train and test.
# NOTE(review): the scaler is fitted on all of `train`, i.e. before the
# train/validation split made later, so validation rows contribute to the scaling statistics.
ss = StandardScaler().fit(train)
train_ss = pd.DataFrame(ss.transform(train), columns=train.columns)
test_ss = pd.DataFrame(ss.transform(test), columns=test.columns)

In [None]:
# Hold out 25% of the rows for validation; stratify=y keeps the label
# proportions the same in both parts, and random_state makes the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_ss,
    y,
    test_size=0.25,
    stratify=y,
    random_state=82,
)

In [None]:
# class_weight='balanced' weights each class inversely to its frequency, which
# guards against an imbalanced label distribution.
lr = LogisticRegression(class_weight='balanced', random_state=82).fit(X_train, y_train)
lr

In [None]:
# Fit on the training split only. The validation split must stay unseen by the
# model: fitting on it as well would replace the training fit with one learned
# from the validation rows, so the "valid score" below would be measured on data
# the model was trained on and every later cell would use that smaller model.
lr.fit(X_train, y_train)

# Predict on both splits and report accuracy for each.
train_pred = lr.predict(X_train)
valid_pred = lr.predict(X_valid)
print('train score : ', accuracy_score(y_train, train_pred))
print('valid score : ', accuracy_score(y_valid, valid_pred))

In [None]:
# Probability assigned to the class in column 1 of predict_proba (lr.classes_[1])
# for every row of the training and validation splits.
# NOTE(review): if price_range has more than two classes, the probabilities of
# the remaining classes are discarded here - confirm this is intended.
train_proba, valid_proba = (
    lr.predict_proba(split)[:, 1] for split in (X_train, X_valid)
)

In [None]:
# ROC AUC on both splits.
# roc_auc_score accepts a 1-D score only for a binary target. For a multiclass
# target it needs the full (n_samples, n_classes) probability matrix and an
# explicit multi_class strategy, otherwise it raises ValueError.
def _split_auc(model, X, y_true):
    """Return the ROC AUC of `model` on (X, y_true).

    Two classes: AUC of the probability in column 1, as before.
    More than two classes: one-vs-rest AUC (macro average, the sklearn default).
    """
    proba = model.predict_proba(X)
    if proba.shape[1] == 2:
        return roc_auc_score(y_true, proba[:, 1])
    return roc_auc_score(y_true, proba, multi_class="ovr")


print("train AUC:", _split_auc(lr, X_train, y_train))
print("valid AUC:", _split_auc(lr, X_valid, y_valid))

In [None]:
# ROC curve(s) on the validation split.
# roc_curve only supports binary targets, so with more than two classes one
# one-vs-rest curve is drawn per class; with two classes the plot is the same
# single curve as before (column 1 of predict_proba).
valid_proba_all = lr.predict_proba(X_valid)

plt.figure(figsize=(7, 5))
if valid_proba_all.shape[1] == 2:
    fpr, tpr, thresholds = roc_curve(y_valid, valid_proba_all[:, 1])
    area = roc_auc_score(y_valid, valid_proba_all[:, 1])
    # Format-spec rounding works for both NumPy scalars and plain Python floats.
    plt.plot(fpr, tpr, label=f"ROC curve (area={area:.2f})")
else:
    # Columns of predict_proba follow the order of lr.classes_.
    for col_idx, cls in enumerate(lr.classes_):
        is_cls = (y_valid == cls).astype(int)
        fpr, tpr, thresholds = roc_curve(is_cls, valid_proba_all[:, col_idx])
        area = roc_auc_score(is_cls, valid_proba_all[:, col_idx])
        plt.plot(fpr, tpr, label=f"class {cls} vs rest (area={area:.2f})")

# Diagonal = performance of a random classifier.
plt.plot([0, 1.0], [0, 1.0], linestyle="--", color="black")
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.grid()
plt.show()

In [None]:
# Score the standardized test features and keep the probability from column 1
# of predict_proba (the class lr.classes_[1]).
# NOTE(review): confirm the submission expects this probability rather than a
# predicted price_range label.
test_proba = lr.predict_proba(test_ss)
predict = test_proba[:, 1]

In [None]:
# Build the submission file from the sample submission layout.
# The sample is re-read with header=None, so its first line is treated as data
# and the columns are named 0, 1, ...
submit = pd.read_csv('./original/sample_submission.csv', header=None)

# Overwrite column 1 with the predictions, cut to the number of rows in the sample.
n_rows = len(submit)
submit[1] = predict[:n_rows]

# Write without a header row and without the index column.
submit.to_csv('./submission/submission_lr.csv', header=None, index=False)

In [None]:
# Preview the first rows of the submission frame that was just written to disk.
submit.head()