## Creating the corpus using gensim

In this notebook we do the following:
* Download the 20 Newsgroups corpus, which is used as the input dataset
* Create a gensim representation of the 20 Newsgroups corpus
* Save the gensim corpus and dictionary using pickle, and the raw text as a CSV file


In [1]:
# sphinx_gallery_thumbnail_number = 2
import logging
# Emit INFO-level log records (timestamp : level : message) so that library
# progress messages show up in the cell outputs.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

### First, download the 20 Newsgroups dataset

In [2]:
from string import punctuation
from nltk import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from sklearn.datasets import fetch_20newsgroups
import numpy as np
import pandas as pd

# Fetch the training split of 20 Newsgroups ("train" is the default subset;
# it is spelled out here so the choice is visible to the reader).
newsgroups = fetch_20newsgroups(subset="train")

In [3]:
# Display the category names of the downloaded corpus.
newsgroups.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [4]:
# One row per post: the raw message text next to its class label.
data = pd.DataFrame(
    {
        "text": newsgroups.data,
        "target": newsgroups.target,
    }
)
    

In [5]:
# Preview the first five rows of the frame.
data.head(n=5)

Unnamed: 0,text,target
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14


## Create the gensim representation of the corpus

In [6]:
# Porter stemmer instance for the (optional) stemming step of the tokenization.
stemmer = PorterStemmer()

# Whitespace tokenizer: with gaps=True the pattern describes the separators
# between tokens rather than the tokens themselves.
tokenizer = RegexpTokenizer(r'\s+', gaps=True)

# English stopwords, stored as a set for fast membership tests.
eng_stopwords = set(stopwords.words("english"))
# Translation table for str.translate(): every character of string.punctuation
# is mapped to a single space.
translate_tab = dict.fromkeys(map(ord, punctuation), " ")

def text2tokens(raw_text, stem=False, min_len=3):
    """Split the raw_text string into a list of cleaned, lower-cased tokens.

    The text is lower-cased, every character listed in ``translate_tab`` is
    replaced by a space, the result is split with ``tokenizer``, tokens found
    in ``eng_stopwords`` are dropped, and finally tokens shorter than
    ``min_len`` characters are discarded.

    Parameters
    ----------
    raw_text : str
        The document to tokenize.
    stem : bool, optional
        If True, every token is passed through ``stemmer.stem`` before the
        length filter is applied. Defaults to False, i.e. the tokens are
        returned unstemmed.
    min_len : int, optional
        Minimum number of characters a token needs in order to be kept.
        Defaults to 3.

    Returns
    -------
    list of str
        The surviving tokens, in their order of appearance in ``raw_text``.
    """
    clean_text = raw_text.lower().translate(translate_tab)
    stripped = (token.strip() for token in tokenizer.tokenize(clean_text))
    tokens = [token for token in stripped if token not in eng_stopwords]
    if stem:
        # Stem before the length filter so that stems which end up too short
        # are discarded as well.
        tokens = [stemmer.stem(token) for token in tokens]
    return [token for token in tokens if len(token) >= min_len]  # skip short tokens

# Convert every document into its list of tokens.
dataset = [text2tokens(document) for document in data["text"].tolist()]

from gensim.corpora import Dictionary

# Build the vocabulary from the tokenized documents (prune_at=None: no pruning
# limit is passed for the construction phase).
dictionary = Dictionary(documents=dataset, prune_at=None)

# Use the Dictionary to remove irrelevant tokens, with the thresholds
# no_below=5 and no_above=0.3 and without a cap on the vocabulary size (keep_n=None).
dictionary.filter_extremes(no_below=5, no_above=0.3, keep_n=None)

# Make sure the token ids are contiguous after the filtering step.
dictionary.compactify()


2021-06-27 01:43:21,860 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2021-06-27 01:43:24,904 : INFO : adding document #10000 to Dictionary(113766 unique tokens: ['60s', '70s', 'addition', 'anyone', 'body']...)
2021-06-27 01:43:25,293 : INFO : built Dictionary(125085 unique tokens: ['60s', '70s', 'addition', 'anyone', 'body']...) from 11314 documents (total 1936447 corpus positions)
2021-06-27 01:43:25,358 : INFO : Dictionary lifecycle event {'msg': "built Dictionary(125085 unique tokens: ['60s', '70s', 'addition', 'anyone', 'body']...) from 11314 documents (total 1936447 corpus positions)", 'datetime': '2021-06-27T01:43:25.294457', 'gensim': '4.0.1', 'python': '3.8.8 (default, Apr 13 2021, 19:58:26) \n[GCC 7.3.0]', 'platform': 'Linux-5.8.0-55-generic-x86_64-with-glibc2.10', 'event': 'created'}
2021-06-27 01:43:25,559 : INFO : discarding 100637 tokens: [('bricklin', 4), ('edu', 7393), ('host', 4840), ('lerxst', 2), ('lines', 11277), ('nntp', 4777), ('organization', 108

In [7]:
# Convert each list of tokens to its bag-of-words representation.
d2b_dataset = list(map(dictionary.doc2bow, dataset))

## Save the corpus and the dictionary

In [8]:
import pickle

# Output locations, relative to the notebook's working directory.
PATH2CORPUS = "./20newsgroup_corpus_gensim.pickle"
PATH2DICTIONARY = "./dictionary_20newsgroups_gensim.pickle"
PATH2CORPUSTEXT = "./20newsgroup_corpus_text.csv"

# Pickle the bag-of-words corpus and its dictionary, one file each.
for target_path, payload in (
    (PATH2CORPUS, d2b_dataset),
    (PATH2DICTIONARY, dictionary),
):
    with open(target_path, 'wb') as handle:
        pickle.dump(payload, handle)

In [9]:
# Export the raw text and targets as CSV. index=True is the default; it is
# spelled out because the row index is written as the first (unnamed) column.
data.to_csv(PATH2CORPUSTEXT, index=True)