# Boosting Classification 심화 실습 - 뉴스 분류하기

In [1]:
! pip install catboost

Collecting catboost
  Downloading catboost-1.2-cp310-cp310-manylinux2014_x86_64.whl (98.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.6/98.6 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Fix NumPy's global random seed so NumPy-driven randomness is repeatable across runs.
np.random.seed(2021)

## 1. Data

이번 실습에서 사용할 데이터는 뉴스 글을 주제별로 분류하는 20 Newsgroups 데이터입니다.  


### 1.1 Data Load

In [3]:
from sklearn.datasets import fetch_20newsgroups

# Download (or load from the local cache) the 20 Newsgroups corpus.
# subset="train" is scikit-learn's default, spelled out here for clarity.
newsgroup = fetch_20newsgroups(subset="train")

In [4]:
# Raw post texts and their integer category labels.
data = newsgroup["data"]
target = newsgroup["target"]

In [5]:
# Peek at one raw post; note that the e-mail style headers are part of the text.
print(data[0])

From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Thanks,
- IL
   ---- brought to you by your neighborhood Lerxst ----







In [6]:
# Integer label of the first post (an index into newsgroup["target_names"]).
target[0]

7

In [7]:
# Human-readable category names, ordered by label index.
newsgroup["target_names"]

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

### 1.2 Data Split
아래의 뉴스 그룹(target 0~3)만 사용
- alt.atheism
- comp.graphics
- comp.os.ms-windows.misc
- comp.sys.ibm.pc.hardware

In [8]:
# Total number of categories in the dataset.
len(newsgroup['target_names'])

20

In [9]:
# Wrap the raw texts and labels in named Series so they can be joined into one frame.
# Read straight from `newsgroup` rather than from `data` / `target`: those names are
# rebound to the filtered subset later in the notebook, so building from them would
# silently give a different result if this cell were re-run out of order.
text = pd.Series(newsgroup["data"], name="text")
target = pd.Series(newsgroup["target"], name="target")

In [11]:
# Place the two Series side by side as the columns "text" and "target".
# `axis` has to be passed by keyword: the positional form is deprecated in pandas 1.x
# and raises a TypeError in pandas 2.x.
df = pd.concat([text, target], axis=1)

  df = pd.concat([text, target], 1)


In [12]:
# Preview the combined frame: one row per post, with its text and its label.
df

Unnamed: 0,text,target
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14
...,...,...
11309,From: jim.zisfein@factory.com (Jim Zisfein) \n...,13
11310,From: ebodin@pearl.tufts.edu\nSubject: Screen ...,4
11311,From: westes@netcom.com (Will Estes)\nSubject:...,3
11312,From: steve@hcrlgw (Steven Collins)\nSubject: ...,1


In [13]:
# Number of posts per label, listed in label order.
df["target"].value_counts().sort_index()

0     480
1     584
2     591
3     590
4     578
5     593
6     585
7     594
8     598
9     597
10    600
11    595
12    591
13    594
14    593
15    599
16    546
17    564
18    465
19    377
Name: target, dtype: int64

In [31]:
# Keep only the posts whose label is 0, 1, 2 or 3 (both bounds inclusive), i.e. the
# first four entries of newsgroup["target_names"]. The original row index is preserved.
df_sample = df.loc[df["target"].between(0, 3)]
df_sample

Unnamed: 0,text,target
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1
7,From: bgrubb@dante.nmsu.edu (GRUBB)\nSubject: ...,3
8,From: holmes7000@iscsvax.uni.edu\nSubject: WIn...,2
15,From: mathew <mathew@mantis.co.uk>\nSubject: R...,0
16,From: ab@nova.cc.purdue.edu (Allen B)\nSubject...,1
...,...,...
11295,From: gt5735a@prism.gatech.EDU (Mark Devaney)\...,3
11297,From: CCMB <CCMB@MUSICA.MCGILL.CA>\nSubject: W...,3
11306,From: mrj@cs.su.oz.au (Mark James)\nSubject: R...,3
11311,From: westes@netcom.com (Will Estes)\nSubject:...,3


In [32]:
# From this point on `data` / `target` refer to the four-class subset only.
data, target = df_sample["text"], df_sample["target"]

In [33]:
# Number of posts that remain after filtering.
data.to_numpy().shape

(2245,)

In [34]:
from sklearn.model_selection import train_test_split

# Use 70% of the posts for training and the remainder for testing.
# A fixed random_state makes the shuffle (and therefore the split) repeatable.
train_data, test_data, train_target, test_target = train_test_split(
    data, target, train_size=0.7, random_state=2021
)

### 1.3 Count Vectorize

In [35]:
import nltk
from nltk import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

# Fetch NLTK's "punkt" tokenizer data (a no-op when it is already present);
# word_tokenize, used as the vectorizer's tokenizer below, depends on it.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

뉴스에 등장한 모든 단어를 사용  

In [36]:
# Learn a vocabulary from every token NLTK's word_tokenize produces on the training
# posts. Fitting on the training split only keeps test vocabulary out of the model.
cnt_vectorizer = CountVectorizer(tokenizer=word_tokenize).fit(train_data)
cnt_vectorizer



In [37]:
# Vocabulary size when every token is kept.
len(cnt_vectorizer.vocabulary_)

72054

최소 10개의 뉴스에 등장한 단어를 사용

In [38]:
# Re-fit the vectorizer, this time dropping tokens that occur in fewer than 10 training
# posts (min_df=10). This replaces the unrestricted vectorizer fitted above.
cnt_vectorizer = CountVectorizer(tokenizer=word_tokenize, min_df=10).fit(train_data)
cnt_vectorizer

In [39]:
# Vocabulary size after applying the min_df=10 document-frequency cut-off.
len(cnt_vectorizer.vocabulary_)

2934

In [40]:
# Encode both splits as document-term count matrices using the vocabulary learned on
# the training posts.
train_matrix, test_matrix = (
    cnt_vectorizer.transform(docs) for docs in (train_data, test_data)
)

## 2. XGBoost

In [41]:
import xgboost as xgb

# XGBoost classifier with library-default hyperparameters (no tuning in this exercise).
xgb_clf = xgb.XGBClassifier()

### 2.1 학습

In [42]:
# Train on the training count matrix; the sparse matrix is passed to XGBoost as-is.
xgb_clf.fit(train_matrix, train_target)

### 2.2 예측

In [43]:
# Predicted labels for the training split and the held-out split, in that order.
xgb_train_pred, xgb_test_pred = map(xgb_clf.predict, (train_matrix, test_matrix))

### 2.3 평가

In [44]:
from sklearn.metrics import accuracy_score

# Fraction of correctly classified posts on each split.
xgb_train_acc = accuracy_score(y_true=train_target, y_pred=xgb_train_pred)
xgb_test_acc = accuracy_score(y_true=test_target, y_pred=xgb_test_pred)

In [45]:
# Report both accuracies, one per line, rounded to four decimals.
print(
    f"XGBoost Train accuracy is {xgb_train_acc:.4f}",
    f"XGBoost Test accuracy is {xgb_test_acc:.4f}",
    sep="\n",
)

XGBoost Train accuracy is 1.0000
XGBoost Test accuracy is 0.8071


## 3. LightGBM

In [46]:
import lightgbm as lgb

# LightGBM classifier with library-default hyperparameters (no tuning in this exercise).
lgb_clf = lgb.LGBMClassifier()

### 3.1 학습

In [47]:
# Inspect the training matrix: its repr reports shape, dtype and storage format.
train_matrix

<1571x2934 sparse matrix of type '<class 'numpy.int64'>'
	with 175510 stored elements in Compressed Sparse Row format>

In [49]:
# Dense view of the same counts (materialises every zero entry as well).
train_matrix.toarray()

array([[   0,    0,    0, ...,   12,    0,    0],
       [   1,    0,    0, ...,    0,    0,    0],
       [   0,    0,    0, ...,    0,    0,    0],
       ...,
       [   0,    0,    0, ...,    0,    0,    0],
       [1187, 1183,  988, ...,    0,    0,    0],
       [   2,    0,    0, ...,    0,    0,    0]])

In [48]:
# LightGBM is trained on a dense copy of the count matrix, whereas the XGBoost and
# CatBoost cells pass the sparse matrix directly.
# NOTE(review): the densification is presumably a workaround for the integer dtype of
# the sparse counts — confirm before removing it.
lgb_clf.fit(train_matrix.toarray(), train_target)

### 3.2 예측

In [50]:
# Predicted labels for both splits; each matrix is densified first, matching how the
# model was trained.
lgb_train_pred, lgb_test_pred = (
    lgb_clf.predict(matrix.toarray()) for matrix in (train_matrix, test_matrix)
)

### 3.3 평가

In [52]:
# Fraction of correctly classified posts on each split.
lgb_train_acc = accuracy_score(y_true=train_target, y_pred=lgb_train_pred)
lgb_test_acc = accuracy_score(y_true=test_target, y_pred=lgb_test_pred)

In [53]:
# Report both accuracies, one per line, rounded to four decimals.
print(
    f"Light Boost Train accuracy is {lgb_train_acc:.4f}",
    f"Light Boost Test accuracy is {lgb_test_acc:.4f}",
    sep="\n",
)

Light Boost Train accuracy is 1.0000
Light Boost Test accuracy is 0.8234


## 4. CatBoost

In [54]:
import catboost as cb

# CatBoost classifier with library-default hyperparameters (no tuning in this exercise).
cb_clf = cb.CatBoostClassifier()

### 4.1 학습

In [56]:
# Train on the sparse count matrix; verbose=False silences CatBoost's per-iteration log.
cb_clf.fit(train_matrix, train_target, verbose=False)

<catboost.core.CatBoostClassifier at 0x7bd7bd54db10>

### 4.2 예측

In [57]:
# Predicted labels for the training split and the held-out split, in that order.
cb_train_pred, cb_test_pred = map(cb_clf.predict, (train_matrix, test_matrix))

### 4.3 평가

In [58]:
# Fraction of correctly classified posts on each split.
cb_train_acc = accuracy_score(y_true=train_target, y_pred=cb_train_pred)
cb_test_acc = accuracy_score(y_true=test_target, y_pred=cb_test_pred)

In [59]:
# Report both accuracies, one per line, rounded to four decimals.
print(
    f"Cat Boost train accuracy is {cb_train_acc:.4f}",
    f"Cat Boost test accuracy is {cb_test_acc:.4f}",
    sep="\n",
)

Cat Boost train accuracy is 0.9994
Cat Boost test accuracy is 0.8309


## 5. 마무리

In [60]:
# Side-by-side comparison of the held-out accuracy of the three boosting models.
print(
    f"XGBoost test accuracy is {xgb_test_acc:.4f}",
    f"Light Boost test accuracy is {lgb_test_acc:.4f}",
    f"Cat Boost test accuracy is {cb_test_acc:.4f}",
    sep="\n",
)

XGBoost test accuracy is 0.8071
Light Boost test accuracy is 0.8234
Cat Boost test accuracy is 0.8309
