In [1]:
!pip install gdown
!gdown --id 

Collecting gdown
  Downloading gdown-4.7.1-py3-none-any.whl (15 kB)
Installing collected packages: gdown
Successfully installed gdown-4.7.1
Downloading...
From: https://drive.google.com/uc?id=1bHascxVvFrqvEEs8KFXWorTnFHrDXHXH
To: /kaggle/working/news.csv
100%|██████████████████████████████████████| 15.1M/15.1M [00:00<00:00, 79.6MB/s]


In [2]:
!pip install sentence_transformers transformers 

Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: sentence_transformers
  Building wheel for sentence_transformers (setup.py) ... [?25ldone
[?25h  Created wheel for sentence_transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125926 sha256=d3f4edbafbaa0dc8b5c330563cfb9fe11de11a815fd0c2abd98874d43dc8358f
  Stored in directory: /root/.cache/pip/wheels/62/f2/10/1e606fd5f02395388f74e7462910fe851042f97238cbbd902f
Successfully built sentence_transformers
Installing collected packages: sentence_transformers
Successfully installed sentence_transformers-2.2.2


In [3]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('all')



[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /usr/share/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /usr/share/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /usr/share/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /usr/share/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /usr/share/nltk_data...
[nltk_data]    |   Package basque_grammars is already up-to-date!
[nltk_data]    | Downloading package bcp47 to /usr/share/nltk_data...
[nltk_data]    | Downloading pa

True

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import re, os
from tqdm.notebook import tqdm

from sklearn.cluster import KMeans

from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import Normalizer
from sklearn import metrics


from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F


plt.style.use('ggplot')

In [5]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    

seed_everything(42)
SEED = 42

In [6]:
data = pd.read_csv('./news.csv')
print(data.shape)
data.head()

(60000, 3)


Unnamed: 0,id,title,contents
0,NEWS_00000,Spanish coach facing action in race row,MADRID (AFP) - Spanish national team coach Lui...
1,NEWS_00001,Bruce Lee statue for divided city,"In Bosnia, where one man #39;s hero is often a..."
2,NEWS_00002,Only Lovers Left Alive's Tilda Swinton Talks A...,Yasmine Hamdan performs 'Hal' which she also s...
3,NEWS_00003,Macromedia contributes to eBay Stores,Macromedia has announced a special version of ...
4,NEWS_00004,Qualcomm plans to phone it in on cellular repairs,Over-the-air fixes for cell phones comes to Qu...


In [7]:
data['text'] = data['title'] + ':' + data['contents']

In [8]:
def preprocess_text(text):
    # URL 제거
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # 해시태그 제거
    text = re.sub(r'#\w+', '', text)

    # 멘션 제거
    text = re.sub(r'@\w+', '', text)

    # 이모지 제거
    text = text.encode('ascii', 'ignore').decode('ascii')

    # 공백 및 특수문자 제거
    text = re.sub(r'\s+', ' ', text).strip()

    # 숫자 제거
    text = re.sub(r'\d+', '', text)

    return text.lower()

In [9]:
data['processed_text'] = data['text'].apply(preprocess_text)
data.head()

Unnamed: 0,id,title,contents,text,processed_text
0,NEWS_00000,Spanish coach facing action in race row,MADRID (AFP) - Spanish national team coach Lui...,Spanish coach facing action in race row:MADRID...,spanish coach facing action in race row:madrid...
1,NEWS_00001,Bruce Lee statue for divided city,"In Bosnia, where one man #39;s hero is often a...","Bruce Lee statue for divided city:In Bosnia, w...","bruce lee statue for divided city:in bosnia, w..."
2,NEWS_00002,Only Lovers Left Alive's Tilda Swinton Talks A...,Yasmine Hamdan performs 'Hal' which she also s...,Only Lovers Left Alive's Tilda Swinton Talks A...,only lovers left alive's tilda swinton talks a...
3,NEWS_00003,Macromedia contributes to eBay Stores,Macromedia has announced a special version of ...,Macromedia contributes to eBay Stores:Macromed...,macromedia contributes to ebay stores:macromed...
4,NEWS_00004,Qualcomm plans to phone it in on cellular repairs,Over-the-air fixes for cell phones comes to Qu...,Qualcomm plans to phone it in on cellular repa...,qualcomm plans to phone it in on cellular repa...


In [10]:
tagged_corpus_list = []

for index, row in tqdm(data.iterrows(), total=len(data)):
    text = row['processed_text']
    tag = row['id']
    tagged_corpus_list.append(TaggedDocument(tags=[tag], words=nltk.word_tokenize(text)))

print('문서의 수 :', len(tagged_corpus_list))

  0%|          | 0/60000 [00:00<?, ?it/s]

문서의 수 : 60000


In [11]:
# doc2vec train
model = Doc2Vec(vector_size=300, alpha=0.025, min_alpha=0.025, workers=8, window=8)
model.build_vocab(tagged_corpus_list)
model.train(tagged_corpus_list, total_examples=model.corpus_count, epochs=100)

In [12]:
# kmean cluster
kmeans = KMeans(n_clusters=6, init='k-means++', random_state=SEED, n_init='auto')
data['doc2vec_predict']= kmeans.fit_predict(model.dv.vectors)

In [13]:
data.head()

Unnamed: 0,id,title,contents,text,processed_text,doc2vec_predict
0,NEWS_00000,Spanish coach facing action in race row,MADRID (AFP) - Spanish national team coach Lui...,Spanish coach facing action in race row:MADRID...,spanish coach facing action in race row:madrid...,1
1,NEWS_00001,Bruce Lee statue for divided city,"In Bosnia, where one man #39;s hero is often a...","Bruce Lee statue for divided city:In Bosnia, w...","bruce lee statue for divided city:in bosnia, w...",1
2,NEWS_00002,Only Lovers Left Alive's Tilda Swinton Talks A...,Yasmine Hamdan performs 'Hal' which she also s...,Only Lovers Left Alive's Tilda Swinton Talks A...,only lovers left alive's tilda swinton talks a...,2
3,NEWS_00003,Macromedia contributes to eBay Stores,Macromedia has announced a special version of ...,Macromedia contributes to eBay Stores:Macromed...,macromedia contributes to ebay stores:macromed...,3
4,NEWS_00004,Qualcomm plans to phone it in on cellular repairs,Over-the-air fixes for cell phones comes to Qu...,Qualcomm plans to phone it in on cellular repa...,qualcomm plans to phone it in on cellular repa...,2


In [14]:
# tfidf
tfidf_vectorizer = TfidfVectorizer(min_df = 5, ngram_range=(1,5))
tfidf_vector = tfidf_vectorizer.fit_transform(data['processed_text'].tolist())
transformer = Normalizer().fit_transform(tfidf_vector)


In [16]:
kmeans = KMeans(n_clusters=6,init='k-means++', random_state=SEED, n_init='auto')
data['tfidf_predict']= kmeans.fit_predict(transformer)

In [17]:
MODELS = {'Distill':'paraphrase-distilroberta-base-v1',
         'T5':'sentence-transformers/sentence-t5-base',
         'MiniLM':'sentence-transformers/all-MiniLM-L6-v2',
         'albert':'sentence-transformers/paraphrase-albert-base-v2',
         'glove':'sentence-transformers/average_word_embeddings_glove.840B.300d',
         'stsb':'sentence-transformers/stsb-mpnet-base-v2',
          'mpnet':'sentence-transformers/all-mpnet-base-v1'
         }

In [18]:
def embedding_cluster(model_name, model_type):
  
    model = SentenceTransformer(model_type)

    # 텍스트 feature 추출
    sentence_embeddings = model.encode(data['processed_text'].tolist())

    # 추출한 feature를 데이터프레임에 저장
    df_embeddings = pd.DataFrame(sentence_embeddings)
    
    kmeans = KMeans(n_clusters=6, init='k-means++', random_state=SEED, n_init='auto')
    
    data[f'{model_name}_predict']= kmeans.fit_predict(sentence_embeddings)
    
    return data

In [19]:
for k, v in MODELS.items():
    embedding_cluster(k, v)

Downloading (…)7f4ef/.gitattributes:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)f279f7f4ef/README.md:   0%|          | 0.00/3.74k [00:00<?, ?B/s]

Downloading (…)79f7f4ef/config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading (…)279f7f4ef/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)7f4ef/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

Downloading (…)279f7f4ef/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)9f7f4ef/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

Batches:   0%|          | 0/1875 [00:00<?, ?it/s]

Downloading (…)2bb58/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)/2_Dense/config.json:   0%|          | 0.00/115 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

Downloading rust_model.ot:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

Downloading (…)21dd52bb58/README.md:   0%|          | 0.00/2.01k [00:00<?, ?B/s]

Downloading (…)dd52bb58/config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading (…)52bb58/convert.ipynb:   0%|          | 0.00/74.6k [00:00<?, ?B/s]

Downloading (…)8/convert_to_fp16.py:   0%|          | 0.00/198 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/219M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)2bb58/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

Downloading (…)d52bb58/modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

Batches:   0%|          | 0/1875 [00:00<?, ?it/s]

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Batches:   0%|          | 0/1875 [00:00<?, ?it/s]

Downloading (…)05c7e/.gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)769ac05c7e/README.md:   0%|          | 0.00/3.71k [00:00<?, ?B/s]

Downloading (…)9ac05c7e/config.json:   0%|          | 0.00/827 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/46.7M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/245 [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

Downloading (…)05c7e/tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/464 [00:00<?, ?B/s]

Downloading (…)ac05c7e/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

Batches:   0%|          | 0/1875 [00:00<?, ?it/s]

Downloading (…)fe988/.gitattributes:   0%|          | 0.00/846 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.64G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/26.4M [00:00<?, ?B/s]

Downloading (…)mbedding_config.json:   0%|          | 0.00/164 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)68191fe988/README.md:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading (…)91fe988/modules.json:   0%|          | 0.00/248 [00:00<?, ?B/s]

Batches:   0%|          | 0/1875 [00:00<?, ?it/s]

Downloading (…)0594e/.gitattributes:   0%|          | 0.00/868 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)3f54a0594e/README.md:   0%|          | 0.00/3.67k [00:00<?, ?B/s]

Downloading (…)54a0594e/config.json:   0%|          | 0.00/588 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)0594e/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

Downloading (…)3f54a0594e/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)4a0594e/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

Batches:   0%|          | 0/1875 [00:00<?, ?it/s]

Downloading (…)b2106/.gitattributes:   0%|          | 0.00/737 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)711acb2106/README.md:   0%|          | 0.00/9.85k [00:00<?, ?B/s]

Downloading (…)1acb2106/config.json:   0%|          | 0.00/591 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)106/data_config.json:   0%|          | 0.00/15.7k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)b2106/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)2106/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)711acb2106/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)acb2106/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Batches:   0%|          | 0/1875 [00:00<?, ?it/s]

In [20]:
data.head()

Unnamed: 0,id,title,contents,text,processed_text,doc2vec_predict,tfidf_predict,Distill_predict,T5_predict,MiniLM_predict,albert_predict,glove_predict,stsb_predict,mpnet_predict
0,NEWS_00000,Spanish coach facing action in race row,MADRID (AFP) - Spanish national team coach Lui...,Spanish coach facing action in race row:MADRID...,spanish coach facing action in race row:madrid...,1,1,3,1,1,3,5,1,5
1,NEWS_00001,Bruce Lee statue for divided city,"In Bosnia, where one man #39;s hero is often a...","Bruce Lee statue for divided city:In Bosnia, w...","bruce lee statue for divided city:in bosnia, w...",1,1,4,2,5,1,5,1,4
2,NEWS_00002,Only Lovers Left Alive's Tilda Swinton Talks A...,Yasmine Hamdan performs 'Hal' which she also s...,Only Lovers Left Alive's Tilda Swinton Talks A...,only lovers left alive's tilda swinton talks a...,2,1,1,5,2,2,3,1,1
3,NEWS_00003,Macromedia contributes to eBay Stores,Macromedia has announced a special version of ...,Macromedia contributes to eBay Stores:Macromed...,macromedia contributes to ebay stores:macromed...,3,3,2,0,0,4,2,0,2
4,NEWS_00004,Qualcomm plans to phone it in on cellular repairs,Over-the-air fixes for cell phones comes to Qu...,Qualcomm plans to phone it in on cellular repa...,qualcomm plans to phone it in on cellular repa...,2,3,2,0,0,4,2,0,2


In [21]:
data.to_csv('./model_predicts.csv',index=False)

In [40]:
def check_predict(model_name):
    print('=================={}=================='.format(model_name))
    for label in range(6):
        print('-----------------------------------{:1d}----------------------------'.format(label))
        print(data[data[f'{model_name}_predict']==label]['text'].iloc[0])
        print()
        print(data[data[f'{model_name}_predict']==label]['text'].iloc[1])
        print()
        print(data[data[f'{model_name}_predict']==label]['text'].iloc[2])
        print()
        print(data[data[f'{model_name}_predict']==label]['text'].iloc[3])
        print()
        print(data[data[f'{model_name}_predict']==label]['text'].iloc[4])

In [30]:
data[data['doc2vec_predict']==0].head()

Unnamed: 0,id,title,contents,text,processed_text,doc2vec_predict,tfidf_predict,Distill_predict,T5_predict,MiniLM_predict,albert_predict,glove_predict,stsb_predict,mpnet_predict
2328,NEWS_02328,Networks to Test New Exit Polling System (AP),AP - Determined to avoid a repeat of high-prof...,Networks to Test New Exit Polling System (AP):...,networks to test new exit polling system (ap):...,0,4,2,0,0,4,2,0,3
37033,NEWS_37033,Irish company hit by Iraqi report,Shares in Irish oil company Petrel Resources h...,Irish company hit by Iraqi report:Shares in Ir...,irish company hit by iraqi report:shares in ir...,0,1,5,4,3,0,0,4,0
56422,NEWS_56422,U.S. Navy to Award Huge Satellite Contract Soon,WASHINGTON (Reuters) - The U.S. Navy will soon...,U.S. Navy to Award Huge Satellite Contract Soo...,u.s. navy to award huge satellite contract soo...,0,3,2,0,0,0,0,0,2


In [27]:
check_predict('doc2vec')

-----------------------------------0----------------------------
Networks to Test New Exit Polling System (AP):AP - Determined to avoid a repeat of high-profile failures in 2000 and 2002, television networks will rely on new systems on Nov. 2 to help project election winners and analyze why voters made their choices. And they have turned to The Associated Press to count the vote for them.

Irish company hit by Iraqi report:Shares in Irish oil company Petrel Resources have lost more than 50% of their value on a report that the firm has failed to win a contract in Iraq.  Reuters news agency reported that Iraq's Oil Ministry has awarded the first post-war oilfield contracts to a Canadian and a Turkish company. By 1700 GMT, Petrel's shares fell from 97p ($1.87) to 44p ($0.85). Petrel said that it has not received any information from Iraqi authorities to confirm or deny the report.  Iraq is seeking to award contracts for three projects, valued at $500m (Â£258.5m). Turkey's Everasia is repo

In [32]:
# '''
# 0: business
# 1: Entertainment
# 2: politics
# 3:sports
# 4:tech
# 5: world
# '''

mapping_dict = {
    0: 0,
    1: 5,
    2: 4,
    3: 2,
    4: 1,
    5: 3
}

data['doc2vec_mapping'] = data['doc2vec_predict'].apply(lambda x: mapping_dict[x])

In [33]:
check_predict('tfidf')

-----------------------------------0----------------------------
Three Palestinians killed in Rafah; Palestinians blast one Israeli &lt;b&gt;...&lt;/b&gt;:Three Palestinians were killed and other three injured one of them in a critical health condition by the fire of the Israeli occupation forces in al-Salam quarters to the south of Rafah near the border strip with Egypt.

Valiant &lt;em&gt;Reg&lt;/em&gt; readers save internet:&lt;strong&gt;Letters&lt;/strong&gt; Reports of demise premature

Rigel, Merck Form Development Partnership:NEW YORK (Reuters) - Biotechnology company Rigel  Pharmaceuticals Inc. &lt;A HREF="http://www.investor.reuters.com/FullQuote.aspx?ticker=RIGL.O target=/stocks/quickinfo/fullquote"&gt;RIGL.O&lt;/A&gt; has entered into a collaboration  pact with drugmaker Merck   Co. Inc. &lt;A HREF="http://www.investor.reuters.com/FullQuote.aspx?ticker=MRK.N target=/stocks/quickinfo/fullquote"&gt;MRK.N&lt;/A&gt; to find treatments  for cancer and potentially other diseases, 

In [34]:
# '''
# 0: business
# 1: Entertainment
# 2: politics
# 3:sports
# 4:tech
# 5: world
# '''

mapping_dict = {
    0: 5,
    1: 1,
    2: 0,
    3: 4,
    4: 3,
    5: 2
}

data['tfidf_mapping'] = data['tfidf_predict'].apply(lambda x: mapping_dict[x])

In [35]:
for k in MODELS.keys():
    print(k)

Distill
T5
MiniLM
albert
glove
stsb
mpnet


In [41]:
check_predict('Distill')

-----------------------------------0----------------------------
A Fair Way to Choose Candidates for Republican Debate://www.huffingtonpost.com/entry/a-fair-way-to-choose-cand_b_7922194.html short_description

Be on TOP://www.huffingtonpost.com/entry/be-on-top-amazon-best-sel_b_12508618.html short_description

Memo To EPA Chief Pruitt://www.huffingtonpost.com/entry/memo-to-epa-chief-pruitt-lets-end-subsidies-for-fossil_us_59ee9567e4b0b8a51417bcc6 short_description

Satire Will Not Save Us://www.huffingtonpost.com/entry/tal-fortgang-satire-will-not-save-us_b_5283369.html short_description

WATCH://www.huffingtonpost.com/entry/perrish-cox-flop-49ers-saints_n_6129774.html short_description
-----------------------------------1----------------------------
Only Lovers Left Alive's Tilda Swinton Talks About Almost Quitting Acting and Yasmine Hamdan Performs 'Hal' Live In NYC   (HuffPo Exclusive Videos) authors:Yasmine Hamdan performs 'Hal' which she also sings in the film during a scene when 

In [42]:
# '''
# 0: business
# 1: Entertainment
# 2: politics
# 3:sports
# 4:tech
# 5: world
# '''

mapping_dict = {
    0: 1,
    1: 2,
    2: 4,
    3: 3,
    4: 5,
    5: 0
}

data['distill_mapping'] = data['Distill_predict'].apply(lambda x: mapping_dict[x])

In [43]:
check_predict('T5')

-----------------------------------0----------------------------
Macromedia contributes to eBay Stores:Macromedia has announced a special version of its Contribute website editing application designed to simplify the creation and customisation of eBay Stores.

Qualcomm plans to phone it in on cellular repairs:Over-the-air fixes for cell phones comes to Qualcomm's CDMA.

Thomson to Back Both Blu-ray and HD-DVD:Company, one of the core backers of Blu-ray, will also support its rival format.

Deere's Color Is Green:With big tractors, big sales, and big earnings, Deere's hoeing a profitable row.

FTC Files First Lawsuit Against Spyware Concerns:The Federal Trade Commission formally announced yesterday its first assault against spyware - bits of computer code that surreptitiously install themselves on the computers of Internet users
-----------------------------------1----------------------------
Spanish coach facing action in race row:MADRID (AFP) - Spanish national team coach Luis Aragone

In [44]:
# '''
# 0: business
# 1: Entertainment
# 2: politics
# 3:sports
# 4:tech
# 5: world
# '''

mapping_dict = {
    0: 4,
    1: 2,
    2: 5,
    3: 3,
    4: 0,
    5: 1
}

data['t5_mapping'] = data['T5_predict'].apply(lambda x: mapping_dict[x])

In [45]:
check_predict('MiniLM')

-----------------------------------0----------------------------
Macromedia contributes to eBay Stores:Macromedia has announced a special version of its Contribute website editing application designed to simplify the creation and customisation of eBay Stores.

Qualcomm plans to phone it in on cellular repairs:Over-the-air fixes for cell phones comes to Qualcomm's CDMA.

Thomson to Back Both Blu-ray and HD-DVD:Company, one of the core backers of Blu-ray, will also support its rival format.

FTC Files First Lawsuit Against Spyware Concerns:The Federal Trade Commission formally announced yesterday its first assault against spyware - bits of computer code that surreptitiously install themselves on the computers of Internet users

Sony PSP Draws Crowds and Lines on First Day (Reuters):Reuters - Game fans stood in lines through a chilly\Tokyo night to be among the first in the world to get their\hands on Sony Corp.'s PlayStation Portable, the consumer\electronics firm's first handheld game m

In [46]:
# '''
# 0: business
# 1: Entertainment
# 2: politics
# 3:sports
# 4:tech
# 5: world
# '''

mapping_dict = {
    0: 4,
    1: 3,
    2: 1,
    3: 0,
    4: 2,
    5: 5
}

data['mini_mapping'] = data['MiniLM_predict'].apply(lambda x: mapping_dict[x])

In [47]:
check_predict('albert')

-----------------------------------0----------------------------
Time to Talk Baseball:It's time to talk about the serious risks and potential benefits of building an expensive ballpark in Washington.

Bump Stock Maker Resumes Sales One Month After Las Vegas Mass Shooting authors:Move along nothing to see here.

Congress Spikes Handout For Private Equity authors:A few Wall Street firms almost won big.

Deere's Color Is Green:With big tractors, big sales, and big earnings, Deere's hoeing a profitable row.

Deep Impact Space Probe Aims to Slam Into Comet (Reuters):Reuters - Astronomers plan to slam an\armchair-sized "impactor" into comet Tempel 1 to see what's\inside -- and possibly help future scientists determine how to\keep such space rocks from colliding with Earth.
-----------------------------------1----------------------------
Bruce Lee statue for divided city:In Bosnia, where one man #39;s hero is often another man #39;s villain, some citizens have decided to honour one whom Serb

In [48]:
# '''
# 0: business
# 1: Entertainment
# 2: politics
# 3:sports
# 4:tech
# 5: world
# '''

mapping_dict = {
    0: 0,
    1: 5,
    2: 1,
    3: 3,
    4: 4,
    5: 2
}

data['albert_mapping'] = data['albert_predict'].apply(lambda x: mapping_dict[x])

In [49]:
check_predict('glove')

-----------------------------------0----------------------------
Time to Talk Baseball:It's time to talk about the serious risks and potential benefits of building an expensive ballpark in Washington.

Bump Stock Maker Resumes Sales One Month After Las Vegas Mass Shooting authors:Move along nothing to see here.

Congress Spikes Handout For Private Equity authors:A few Wall Street firms almost won big.

Deere's Color Is Green:With big tractors, big sales, and big earnings, Deere's hoeing a profitable row.

Kmart-Sears merger about price, quality:Average customers know the same thing that ministers of high finance understand: It #39;s all about the price. Shoppers Thursday at the Billings Sears store were eager to find out what a proposed
-----------------------------------1----------------------------
Obama Marks Anniversary Of 9/11 Attacks With Moment Of Silence authors:We stand as strong as ever.

Republican Congressman Says Trump Should Apologize To Obama And The UK authors:Best not 

In [50]:
# '''
# 0: business
# 1: Entertainment
# 2: politics
# 3:sports
# 4:tech
# 5: world
# '''

mapping_dict = {
    0: 0,
    1: 2,
    2: 4,
    3: 1,
    4: 5,
    5: 3
}

data['glove_mapping'] = data['glove_predict'].apply(lambda x: mapping_dict[x])

In [51]:
check_predict('stsb')

-----------------------------------0----------------------------
Macromedia contributes to eBay Stores:Macromedia has announced a special version of its Contribute website editing application designed to simplify the creation and customisation of eBay Stores.

Qualcomm plans to phone it in on cellular repairs:Over-the-air fixes for cell phones comes to Qualcomm's CDMA.

Thomson to Back Both Blu-ray and HD-DVD:Company, one of the core backers of Blu-ray, will also support its rival format.

Obama Administration Helps Wall Street Criminals Dodge Accountability authors:Obama administration proposal would aid big banks that have pleaded guilty to felony antitrust charges.

Fischer's Fiancee: Marriage Plans Genuine (AP):AP - Former chess champion Bobby Fischer's announcement thathe is engaged to a Japanese woman could win him sympathy among Japanese officials and help him avoid deportation to the United States, his fiancee and one of his supporters said Tuesday.
----------------------------

In [52]:
# '''
# 0: business
# 1: Entertainment
# 2: politics
# 3:sports
# 4:tech
# 5: world
# '''

mapping_dict = {
    0: 4,
    1: 1,
    2: 5,
    3: 2,
    4: 0,
    5: 3
}

data['stsb_mapping'] = data['stsb_predict'].apply(lambda x: mapping_dict[x])

In [53]:
check_predict('mpnet')

-----------------------------------0----------------------------
Bump Stock Maker Resumes Sales One Month After Las Vegas Mass Shooting authors:Move along nothing to see here.

Deere's Color Is Green:With big tractors, big sales, and big earnings, Deere's hoeing a profitable row.

Kmart-Sears merger about price, quality:Average customers know the same thing that ministers of high finance understand: It #39;s all about the price. Shoppers Thursday at the Billings Sears store were eager to find out what a proposed

Oil Falls Below \$49 on Nigeria Cease-Fire:LONDON (Reuters) - Oil prices dropped from record highs  above \$50 a barrel on Wednesday as the U.S. government reported  a surprise increase in crude stocks and rebels in Nigeria's  oil-rich delta region agreed a cease-fire.

ABN Amro Profit Rises, Buoyed by Sale of Asia Stake (Update4):ABN Amro Holding NV, the largest Dutch bank, said profit rose 26 percent in the third quarter, buoyed by a gain from selling its stake in Bank of As

In [55]:
# '''
# 0: business
# 1: Entertainment
# 2: politics
# 3:sports
# 4:tech
# 5: world
# '''

mapping_dict = {
    0: 0,
    1: 1,
    2: 4,
    3: 2,
    4: 5,
    5: 3
}

data['mpnet_mapping'] = data['mpnet_predict'].apply(lambda x: mapping_dict[x])

In [56]:
data.head()

Unnamed: 0,id,title,contents,text,processed_text,doc2vec_predict,tfidf_predict,Distill_predict,T5_predict,MiniLM_predict,...,mpnet_predict,doc2vec_mapping,tfidf_mapping,distill_mapping,t5_mapping,mini_mapping,albert_mapping,glove_mapping,stsb_mapping,mpnet_mapping
0,NEWS_00000,Spanish coach facing action in race row,MADRID (AFP) - Spanish national team coach Lui...,Spanish coach facing action in race row:MADRID...,spanish coach facing action in race row:madrid...,1,1,3,1,1,...,5,5,1,3,2,3,3,3,1,3
1,NEWS_00001,Bruce Lee statue for divided city,"In Bosnia, where one man #39;s hero is often a...","Bruce Lee statue for divided city:In Bosnia, w...","bruce lee statue for divided city:in bosnia, w...",1,1,4,2,5,...,4,5,1,5,5,5,5,3,1,5
2,NEWS_00002,Only Lovers Left Alive's Tilda Swinton Talks A...,Yasmine Hamdan performs 'Hal' which she also s...,Only Lovers Left Alive's Tilda Swinton Talks A...,only lovers left alive's tilda swinton talks a...,2,1,1,5,2,...,1,4,1,2,1,1,1,1,1,1
3,NEWS_00003,Macromedia contributes to eBay Stores,Macromedia has announced a special version of ...,Macromedia contributes to eBay Stores:Macromed...,macromedia contributes to ebay stores:macromed...,3,3,2,0,0,...,2,2,4,4,4,4,4,4,4,4
4,NEWS_00004,Qualcomm plans to phone it in on cellular repairs,Over-the-air fixes for cell phones comes to Qu...,Qualcomm plans to phone it in on cellular repa...,qualcomm plans to phone it in on cellular repa...,2,3,2,0,0,...,2,4,4,4,4,4,4,4,4,4


In [57]:
data['final'] = data.iloc[:,-9:].mode(axis=1)[0].astype(int)

In [58]:
data.head()

Unnamed: 0,id,title,contents,text,processed_text,doc2vec_predict,tfidf_predict,Distill_predict,T5_predict,MiniLM_predict,...,doc2vec_mapping,tfidf_mapping,distill_mapping,t5_mapping,mini_mapping,albert_mapping,glove_mapping,stsb_mapping,mpnet_mapping,final
0,NEWS_00000,Spanish coach facing action in race row,MADRID (AFP) - Spanish national team coach Lui...,Spanish coach facing action in race row:MADRID...,spanish coach facing action in race row:madrid...,1,1,3,1,1,...,5,1,3,2,3,3,3,1,3,3
1,NEWS_00001,Bruce Lee statue for divided city,"In Bosnia, where one man #39;s hero is often a...","Bruce Lee statue for divided city:In Bosnia, w...","bruce lee statue for divided city:in bosnia, w...",1,1,4,2,5,...,5,1,5,5,5,5,3,1,5,5
2,NEWS_00002,Only Lovers Left Alive's Tilda Swinton Talks A...,Yasmine Hamdan performs 'Hal' which she also s...,Only Lovers Left Alive's Tilda Swinton Talks A...,only lovers left alive's tilda swinton talks a...,2,1,1,5,2,...,4,1,2,1,1,1,1,1,1,1
3,NEWS_00003,Macromedia contributes to eBay Stores,Macromedia has announced a special version of ...,Macromedia contributes to eBay Stores:Macromed...,macromedia contributes to ebay stores:macromed...,3,3,2,0,0,...,2,4,4,4,4,4,4,4,4,4
4,NEWS_00004,Qualcomm plans to phone it in on cellular repairs,Over-the-air fixes for cell phones comes to Qu...,Qualcomm plans to phone it in on cellular repa...,qualcomm plans to phone it in on cellular repa...,2,3,2,0,0,...,4,4,4,4,4,4,4,4,4,4


In [59]:
!gdown --id 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Downloading...
From: https://drive.google.com/uc?id=1wjP6suUImTJX36EBjnY9coP_RWsId1xy
To: /kaggle/working/sample_submission.csv
100%|█████████████████████████████████████████| 840k/840k [00:00<00:00, 112MB/s]


In [60]:
sample = pd.read_csv('sample_submission.csv')

In [61]:
sample['category'] = data['final'].values
sample['category'].head()

0    3
1    5
2    1
3    4
4    4
Name: category, dtype: int64

In [62]:
sample.to_csv('ensemble_submit.csv', index=False)

In [None]:
# import os
# import pathlib

In [None]:
# # Deleting the files
# files_to_delete = './*.csv' # this considers only ".txt" files. If you want to delete all files, use "./*"
# files_list = pathlib.Path(os.getcwd()).glob(files_to_delete)
# for file_path in files_list:
#     os.remove(file_path)