In [1]:
import pandas as pd
import numpy as np

# データ取得

In [2]:
data = pd.read_csv('data/train.csv')

In [3]:
data = data[['jap_question_1_wakati', 'jap_question_2_wakati', 'is_duplicate']]

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24000 entries, 0 to 23999
Data columns (total 3 columns):
jap_question_1_wakati    24000 non-null object
jap_question_2_wakati    24000 non-null object
is_duplicate             24000 non-null int64
dtypes: int64(1), object(2)
memory usage: 562.6+ KB


# tf-idfの取得
特徴量として使うためにtf-idfを計算  
2つの入力文それぞれに対してtf-idf求め、2つを連結させて特徴量とする

In [5]:
data_all = pd.concat([data['jap_question_1_wakati'], data['jap_question_1_wakati']], axis=0
                    ).to_numpy()

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
vec_tfidf = TfidfVectorizer()

vec_tfidf.fit(data_all)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [8]:
data1 = vec_tfidf.transform(data['jap_question_1_wakati'].to_numpy())
data2 = vec_tfidf.transform(data['jap_question_2_wakati'].to_numpy())

assert data1.shape == data2.shape and data1.shape[0] == len(data)

In [9]:
from scipy.sparse import hstack
X = hstack([data1, data2])

assert X.shape == (data1.shape[0], data1.shape[1] * 2)

In [10]:
y = data[['is_duplicate']].to_numpy()

assert y.dtype == np.int64

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, random_state=123)

In [16]:
X_train.shape[0] / X.shape[0]

0.75

# model definition
random forestで2値分類(pair or not)

In [28]:
model = RandomForestClassifier()

In [29]:
model.fit(X_train, y_train)

  """Entry point for launching an IPython kernel.


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [33]:
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

In [34]:
print('train acc: ', train_score)
print('test acc:', test_score)

train acc:  0.9772777777777778
test acc: 0.7186666666666667


# classification_report
accuracy, precision, recall, f1-scoreなどを算出

In [None]:
from sklearn.metrics import classification_report

In [44]:
# train metrics
y_pred = model.predict(X_train)
rep_train = classification_report(y_train, y_pred)

print(rep_train)

              precision    recall  f1-score   support

           0       0.97      1.00      0.98     11324
           1       1.00      0.94      0.97      6676

    accuracy                           0.98     18000
   macro avg       0.98      0.97      0.98     18000
weighted avg       0.98      0.98      0.98     18000



In [45]:
# test metrics
y_pred = model.predict(X_test)
rep = classification_report(y_test, y_pred)

print(rep)

              precision    recall  f1-score   support

           0       0.72      0.91      0.80      3757
           1       0.72      0.40      0.51      2243

    accuracy                           0.72      6000
   macro avg       0.72      0.65      0.66      6000
weighted avg       0.72      0.72      0.69      6000

