#70

In [2]:
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00359/NewsAggregatorDataset.zip
!unzip NewsAggregatorDataset.zip

--2023-06-17 03:51:53--  https://archive.ics.uci.edu/ml/machine-learning-databases/00359/NewsAggregatorDataset.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘NewsAggregatorDataset.zip’

NewsAggregatorDatas     [         <=>        ]  27.87M  10.8MB/s    in 2.6s    

2023-06-17 03:51:56 (10.8 MB/s) - ‘NewsAggregatorDataset.zip’ saved [29224203]

Archive:  NewsAggregatorDataset.zip
  inflating: 2pageSessions.csv       
   creating: __MACOSX/
  inflating: __MACOSX/._2pageSessions.csv  
  inflating: newsCorpora.csv         
  inflating: __MACOSX/._newsCorpora.csv  
  inflating: readme.txt              
  inflating: __MACOSX/._readme.txt   


In [3]:
# 読込時のエラー回避のためダブルクォーテーションをシングルクォーテーションに置換、'\''はエスケープされたシングルクオーテーション
!sed -e 's/"/'\''/g' ./newsCorpora.csv > ./newsCorpora_re.csv

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split

# データの読込
df = pd.read_csv('./newsCorpora_re.csv', header=None, sep='\t', names=['ID', 'TITLE', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY', 'HOSTNAME', 'TIMESTAMP'])

# データの抽出
df = df.loc[df['PUBLISHER'].isin(['Reuters', 'Huffington Post', 'Businessweek', 'Contactmusic.com', 'Daily Mail']), ['TITLE', 'CATEGORY']]

# データの分割,train,testの順に返す。stratifyで均等に分割したい要素を指定(多くは正解データ)
train, valid_test = train_test_split(df, test_size=0.2, shuffle=True, random_state=123, stratify=df['CATEGORY'])
valid, test = train_test_split(valid_test, test_size=0.5, shuffle=True, random_state=123, stratify=valid_test['CATEGORY'])

# データの保存
train.to_csv('./train.txt', sep='\t', index=False)
valid.to_csv('./valid.txt', sep='\t', index=False)
test.to_csv('./test.txt', sep='\t', index=False)

# 事例数の確認
print('【学習データ】')
print(train['CATEGORY'].value_counts())
print('【検証データ】')
print(valid['CATEGORY'].value_counts())
print('【評価データ】')
print(test['CATEGORY'].value_counts())


【学習データ】
b    4501
e    4235
t    1220
m     728
Name: CATEGORY, dtype: int64
【検証データ】
b    563
e    529
t    153
m     91
Name: CATEGORY, dtype: int64
【評価データ】
b    563
e    530
t    152
m     91
Name: CATEGORY, dtype: int64


In [5]:
#このセルでは特徴量の次元揃えるためにfeature_names(別名nameoftrain)を作成、試しに全データの特徴量を表示
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
nameoftrain = ""
corpus=[]

for index, tra in df.iterrows():
  corpus.append(tra['TITLE'])
vectorizer = TfidfVectorizer()#tfidfを用いた特徴量

vec = vectorizer.fit_transform(corpus)
feature_names = vectorizer.get_feature_names_out()
nameoftrain = feature_names
feature = pd.DataFrame(vec.toarray(), columns=feature_names)
display(feature)

Unnamed: 0,00,07,08,09,0ff,0ut,10,100,1000,10000,...,zone,zooey,zoosk,zuckerberg,zynga,zâ,œf,œlousyâ,œpiece,œwaist
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13351,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13352,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13353,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13354,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
#特徴量生成関数を定義ただし08では使用しない
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

def feature_value(t):
  corpus=[]

  for index, tra in t.iterrows():
    corpus.append(tra['TITLE'])
  vectorizer = TfidfVectorizer()#tfidfを用いた特徴量

  vec = vectorizer.fit_transform(corpus)
  feature_names = vectorizer.get_feature_names_out()
  feature = pd.DataFrame(vec.toarray(), columns=feature_names)
  for i in nameoftrain:#次元揃えるために全データの単語を扱う
    if not(i in  feature_names):
      feature[i] = 0
  display(feature)
  feature = feature.sort_index(axis=1)#後々のための昇順ソートしておく
  return feature

In [7]:
#こんな感じで使用
print(type(feature_value(train)))


  feature[i] = 0
  feature[i] = 0
  feature[i] = 0
  feature[i] = 0
  feature[i] = 0
  feature[i] = 0
  feature[i] = 0
  feature[i] = 0
  feature[i] = 0
  feature[i] = 0
  feature[i] = 0
  feature[i] = 0
  feature[i] = 0
  feature[i] = 0
  feature[i] = 0
  feature[i] = 0
  feature[i] = 0
  feature[i] = 0
  feature[i] = 0
  feature[i] = 0
  feature[i] = 0
  feature[i] = 0
  feature[i] = 0
  feature[i] = 0
  feature[i] = 0
  feature[i] = 0
  feature[i] = 0
  feature[i] = 0
  feature[i] = 0
  feature[i] = 0
  feature[i] = 0
  feature[i] = 0
  feature[i] = 0
  feature[i] = 0
  feature[i] = 0
  feature[i] = 0
  feature[i] = 0
  feature[i] = 0
  feature[i] = 0
  feature[i] = 0
  feature[i] = 0
  feature[i] = 0
  feature[i] = 0
  feature[i] = 0
  feature[i] = 0
  feature[i] = 0
  feature[i] = 0
  feature[i] = 0
  feature[i] = 0
  feature[i] = 0
  feature[i] = 0
  feature[i] = 0
  feature[i] = 0
  feature[i] = 0
  feature[i] = 0
  feature[i] = 0
  feature[i] = 0
  feature[i] = 0
  feature[i] =

Unnamed: 0,00,07,08,09,0ff,0ut,10,100,1000,10000,...,wrists,xl,xu,yanking,yorker,yovanna,zack,zebras,zombies,zâ
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10679,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
10680,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
10681,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
10682,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


<class 'pandas.core.frame.DataFrame'>


In [8]:
import gdown
from gensim.models import KeyedVectors

# 学習済み単語ベクトルのダウンロード
url = "https://drive.google.com/uc?id=0B7XkCwpI5KDYNlNUTTlSS21pQmM"
output = 'GoogleNews-vectors-negative300.bin.gz'
gdown.download(url, output, quiet=True)

# ダウンロードファイルのロード
model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)

In [9]:
import string
import torch

def transform_w2v(text):
  table = str.maketrans(string.punctuation, ' '*len(string.punctuation))
  words = text.translate(table).split()  # 記号をスペースに置換後、スペースで分割してリスト化
  vec = [model[word] for word in words if word in model]  # 1語ずつベクトル化

  return torch.tensor(sum(vec) / len(vec))  # 平均ベクトルをTensor型に変換して出力

# 特徴ベクトルの作成
X_train = torch.stack([transform_w2v(text) for text in train['TITLE']])
X_valid = torch.stack([transform_w2v(text) for text in valid['TITLE']])
X_test = torch.stack([transform_w2v(text) for text in test['TITLE']])

print(X_train.size())
print(X_train)

torch.Size([10684, 300])
tensor([[ 0.0837,  0.0056,  0.0068,  ...,  0.0751,  0.0433, -0.0868],
        [ 0.0272,  0.0266, -0.0947,  ..., -0.1046, -0.0489, -0.0092],
        [ 0.0577, -0.0159, -0.0780,  ..., -0.0421,  0.1229,  0.0876],
        ...,
        [ 0.0392, -0.0052,  0.0686,  ..., -0.0175,  0.0061, -0.0224],
        [ 0.0798,  0.1017,  0.1066,  ..., -0.0752,  0.0623,  0.1138],
        [ 0.1664,  0.0451,  0.0508,  ..., -0.0531, -0.0183, -0.0039]])


In [10]:
#行列データのラベル生成
Y_train=[]
Y_valid=[]
Y_test=[]
for i in range(len(train['CATEGORY'])):
  if(train['CATEGORY'].iloc[i]=='b'):
    Y_train.append(0)
  if(train['CATEGORY'].iloc[i]=='t'):
    Y_train.append(1)
  if(train['CATEGORY'].iloc[i]=='e'):
    Y_train.append(2)
  if(train['CATEGORY'].iloc[i]=='m'):
    Y_train.append(3)
Y_train=pd.Series(Y_train)

for i in range(len(test['CATEGORY'])):
  if(test['CATEGORY'].iloc[i]=='b'):
    Y_test.append(0)
  if(test['CATEGORY'].iloc[i]=='t'):
    Y_test.append(1)
  if(test['CATEGORY'].iloc[i]=='e'):
    Y_test.append(2)
  if(test['CATEGORY'].iloc[i]=='m'):
    Y_test.append(3)
Y_test=pd.Series(Y_test)

for i in range(len(valid['CATEGORY'])):
  if(valid['CATEGORY'].iloc[i]=='b'):
    Y_valid.append(0)
  if(valid['CATEGORY'].iloc[i]=='t'):
    Y_valid.append(1)
  if(valid['CATEGORY'].iloc[i]=='e'):
    Y_valid.append(2)
  if(valid['CATEGORY'].iloc[i]=='m'):
    Y_valid.append(3)
Y_valid=pd.Series(Y_valid)

Y_train=torch.tensor(Y_train)
Y_test=torch.tensor(Y_test)
Y_valid=torch.tensor(Y_valid)

In [11]:
print(Y_train)
print(Y_train.size())

tensor([0, 1, 3,  ..., 0, 3, 2])
torch.Size([10684])


# 71

In [12]:
from torch import nn

class SLPNet(nn.Module):
  def __init__(self, input_size, output_size):
    super().__init__()
    self.fc = nn.Linear(input_size, output_size, bias=False)
    nn.init.normal_(self.fc.weight, 0.0, 1.0)  # 正規乱数で重みを初期化

  def forward(self, x):
    x = self.fc(x)
    return x

In [13]:
model = SLPNet(300, 4)  # 単層ニューラルネットワークの初期化
y_hat_1 = torch.softmax(model(X_train[:1]), dim=-1)#質問：このdim=-1は何なぜ-1
print(y_hat_1)

tensor([[0.1287, 0.7505, 0.0747, 0.0461]], grad_fn=<SoftmaxBackward0>)


In [14]:
Y_hat = torch.softmax(model.forward(X_train[:4]), dim=-1)
print(Y_hat)

tensor([[0.1287, 0.7505, 0.0747, 0.0461],
        [0.0611, 0.5965, 0.0326, 0.3098],
        [0.1074, 0.4652, 0.0252, 0.4023],
        [0.4371, 0.1155, 0.0192, 0.4282]], grad_fn=<SoftmaxBackward0>)


# 72

In [15]:
criterion = nn.CrossEntropyLoss()

l_1 = criterion(model(X_train[:1]), Y_train[:1])  # 入力ベクトルはsoftmax前の値
model.zero_grad()  # 勾配をゼロで初期化
l_1.backward()  # 勾配を計算
print(f'損失: {l_1:.4f}')
print(f'勾配:\n{model.fc.weight.grad}')

損失: 2.0502
勾配:
tensor([[-0.0729, -0.0049, -0.0059,  ..., -0.0655, -0.0377,  0.0756],
        [ 0.0628,  0.0042,  0.0051,  ...,  0.0564,  0.0325, -0.0651],
        [ 0.0062,  0.0004,  0.0005,  ...,  0.0056,  0.0032, -0.0065],
        [ 0.0039,  0.0003,  0.0003,  ...,  0.0035,  0.0020, -0.0040]])


In [16]:
l = criterion(model(X_train[:4]), Y_train[:4])
model.zero_grad()
l.backward()
print(f'損失: {l:.4f}')
print(f'勾配:\n{model.fc.weight.grad}')

損失: 1.8575
勾配:
tensor([[-0.0223,  0.0042,  0.0018,  ..., -0.0206, -0.0026,  0.0189],
        [ 0.0181, -0.0020,  0.0036,  ...,  0.0194,  0.0285, -0.0058],
        [ 0.0158, -0.0119, -0.0163,  ...,  0.0036, -0.0084,  0.0039],
        [-0.0115,  0.0098,  0.0110,  ..., -0.0024, -0.0175, -0.0170]])


# 73

In [17]:
from torch.utils.data import Dataset

class NewsDataset(Dataset):
  def __init__(self, X, y):  # datasetの構成要素を指定
    self.X = X
    self.y = y

  def __len__(self):  # len(dataset)で返す値を指定
    return len(self.y)

  def __getitem__(self, idx):  # dataset[idx]で返す値を指定
    return [self.X[idx], self.y[idx]]

In [18]:
from torch.utils.data import DataLoader

# Datasetの作成
dataset_train = NewsDataset(X_train, Y_train)
dataset_valid = NewsDataset(X_valid, Y_valid)
dataset_test = NewsDataset(X_test, Y_test)

# Dataloaderの作成
dataloader_train = DataLoader(dataset_train, batch_size=1, shuffle=True)
dataloader_valid = DataLoader(dataset_valid, batch_size=len(dataset_valid), shuffle=False)
dataloader_test = DataLoader(dataset_test, batch_size=len(dataset_test), shuffle=False)


In [19]:
# モデルの定義
model = SLPNet(300, 4)

# 損失関数の定義
criterion = nn.CrossEntropyLoss()

# オプティマイザの定義
optimizer = torch.optim.SGD(model.parameters(), lr=1e-1)

# 学習、エポックは全データを何周するか10だから学習データ１０周
num_epochs = 10
for epoch in range(num_epochs):
  # 訓練モードに設定
  model.train()
  loss_train = 0.0
  for i, (inputs, labels) in enumerate(dataloader_train):
    # 勾配をゼロで初期化
    optimizer.zero_grad()

    # 順伝播 + 誤差逆伝播 + 重み更新
    outputs = model(inputs)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()

    # 損失を記録
    loss_train += loss.item()

  # バッチ単位の平均損失計算
  loss_train = loss_train / i

  # 検証データの損失計算
  model.eval()
  with torch.no_grad():
    inputs, labels = next(iter(dataloader_valid))#１バッチ分取り出すため、バッチサイズの定義的に全ての評価データを取り出している
    outputs = model(inputs)
    loss_valid = criterion(outputs, labels)

  # ログを出力
  print(f'epoch: {epoch + 1}, loss_train: {loss_train:.4f}, loss_valid: {loss_valid:.4f}')

epoch: 1, loss_train: 0.4720, loss_valid: 0.3539
epoch: 2, loss_train: 0.3104, loss_valid: 0.3228
epoch: 3, loss_train: 0.2791, loss_valid: 0.3112
epoch: 4, loss_train: 0.2640, loss_valid: 0.3149
epoch: 5, loss_train: 0.2544, loss_valid: 0.3047
epoch: 6, loss_train: 0.2473, loss_valid: 0.3063
epoch: 7, loss_train: 0.2420, loss_valid: 0.3037
epoch: 8, loss_train: 0.2363, loss_valid: 0.3055
epoch: 9, loss_train: 0.2346, loss_valid: 0.3032
epoch: 10, loss_train: 0.2318, loss_valid: 0.3100


# 74

In [20]:
def calculate_accuracy(model, loader):
  model.eval()
  total = 0
  correct = 0
  with torch.no_grad():
    for inputs, labels in loader:
      outputs = model(inputs)
      pred = torch.argmax(outputs, dim=-1)#テンソルの最後の次元を指定 dim=-1で
      total += len(inputs)
      correct += (pred == labels).sum().item()

  return correct / total



acc_train = calculate_accuracy(model, dataloader_train)
acc_test = calculate_accuracy(model, dataloader_test)
print(f'正解率（学習データ）：{acc_train:.3f}')
print(f'正解率（評価データ）：{acc_test:.3f}')

正解率（学習データ）：0.924
正解率（評価データ）：0.908


# 75

# 76

# 77

# 78

# 79