In [2]:
# Load each of the data splits that were saved in knock50
import pandas as pd

split_files = [
    './news+aggregator/train.txt',
    './news+aggregator/valid.txt',
    './news+aggregator/test.txt',
]
# Every file is tab-separated; its first row is consumed as the header and
# the columns are renamed to TITLE / CATEGORY.
train, valid, test = [
    pd.read_csv(path, sep="\t", header=0, names=["TITLE", "CATEGORY"])
    for path in split_files
]
print(train)

                                                   TITLE CATEGORY
0      Justin Bieber Under Investigation For Attempte...        e
1      Exxon Report Claims World 'Highly Unlikely' To...        b
2      Jack White Records, Releases Single In Hours F...        e
3      President Barack Obama Releases Proclamation D...        t
4      Samsung Shares Steady After Chairman's Heart A...        m
...                                                  ...      ...
10667  J.K. Rowling Brings Her Magic To TV, HBO And B...        e
10668  UPDATE 2-Peace Corps pulls volunteers from Wes...        m
10669  GRAINS-US soybean prices climb on rising Chine...        b
10670  Seafood Fraud Under Fire As Lawmakers Look To ...        b
10671  A Hedge Fund Wants to Teach PetSmart Some New ...        b

[10672 rows x 2 columns]


In [15]:
import re
import pandas as pd
from nltk import stem

# Combine the three splits into one frame.
# pd.concat with axis=0 stacks them vertically (train, then valid, then test);
# reset_index(drop=True) swaps the per-split row labels for a fresh 0..N-1 index.
splits = [train, valid, test]
df = pd.concat(splits, axis=0).reset_index(drop=True)

# Shared Porter stemmer, created lazily on the first preprocessing() call so
# that it is not rebuilt for every single title.
_STEMMER = None


def preprocessing(text):
    """Normalize a headline into a space-separated string of stemmed tokens.

    Steps, in order:
      1. delete punctuation / symbol characters,
      2. replace a hyphen surrounded by whitespace with a single space,
      3. collapse every run of digits into a single ``0``,
      4. lowercase the text,
      5. split on whitespace and stem each token with NLTK's Porter stemmer.

    Args:
        text: The raw title string.

    Returns:
        The processed tokens joined by single spaces.
    """
    global _STEMMER
    if _STEMMER is None:
        # The stemmer does not depend on the input text, so one instance is
        # reused across calls instead of constructing a new one per row.
        _STEMMER = stem.PorterStemmer()

    # Remove symbols
    text_clean = re.sub(r'[\"\'.,:;\(\)#\|\*\+\!\?#$%&/\]\[\{\}]', '', text)
    # Replace connector hyphens such as ' - ' with a single space
    text_clean = re.sub(r'\s-\s', ' ', text_clean)
    # Normalize numbers (every run of digits becomes '0')
    text_clean = re.sub('[0-9]+', '0', text_clean)
    # Lowercase
    text_clean = text_clean.lower()
    # Stem each whitespace-separated token to keep only its word stem
    return ' '.join(_STEMMER.stem(token) for token in text_clean.split())


# Run every title through preprocessing() and store the cleaned text back in TITLE
titles_clean = df['TITLE'].apply(preprocessing)
df['TITLE'] = titles_clean
df.head()

Unnamed: 0,TITLE,CATEGORY
0,justin bieber under investig for attempt robbe...,e
1,exxon report claim world highli unlik to limit...,b
2,jack white record releas singl in hour for rec...,e
3,presid barack obama releas proclam declar june...,t
4,samsung share steadi after chairman heart attack,m


In [19]:
# Use TF-IDF (a statistic for how important a term is within a document) as the features
from sklearn.feature_extraction.text import TfidfVectorizer

# Ignore terms that occur in fewer than 10 documents (min_df=10) and
# compute TF-IDF over 1-grams and 2-grams.
vectorizer = TfidfVectorizer(min_df=10, ngram_range=(1, 2))

# df holds the train rows first, then valid, then test, so these two counts
# are enough to locate every split by row position.
n_train = len(train)
n_valid = len(valid)

# Learn the vocabulary and IDF weights from the training titles only.
# Fitting on train+valid+test would let the held-out splits influence the
# features (data leakage) and make later evaluation overly optimistic.
vectorizer.fit(df['TITLE'].iloc[:n_train])

# Vectorize every title with the train-fitted vectorizer and store the result
# as a dense array (rows are L2-normalized by TfidfVectorizer's default norm).
X = vectorizer.transform(df['TITLE']).toarray()

# One column per vocabulary term (1-gram or 2-gram)
X_df = pd.DataFrame(X, columns=vectorizer.get_feature_names_out())

# Slice X_df back into the original splits by row position
train_X = X_df.iloc[:n_train, :]  # features for the training data
valid_X = X_df.iloc[n_train:n_train + n_valid, :]  # features for the validation data
test_X = X_df.iloc[n_train + n_valid:, :]  # features for the test data

# Save each split as a tab-separated file
train_X.to_csv('train.feature.txt', sep='\t', index=False)
valid_X.to_csv('valid.feature.txt', sep='\t', index=False)
test_X.to_csv('test.feature.txt', sep='\t', index=False)
train_X.head()

Unnamed: 0,0d,0k,0m,0million,0nd,0s,0st,0th,0th birthday,aa,...,your,your mother,yr,yr high,yuan,zac,zac efron,zendaya,zone,zone bond
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
