In [2]:
import numpy as np 
import pandas as pd

In [3]:
import nltk   # Natural language processing tool-kti
import textwrap  # Use to format and manipulate text by adjusting paras
from nltk import word_tokenize 
from nltk.tokenize.treebank import TreebankWordDetokenizer # Used to convert tokenized sentences into original string..
# Detokenization required to reformulate the article

In [4]:
# word_tokenize relies on NLTK's Punkt tokenizer models, so fetch them once up front.
# NOTE(review): recent NLTK releases may look for the "punkt_tab" resource instead —
# confirm against the installed NLTK version.
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\reliance\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
# Load the BBC articles; the path is relative to the notebook's working directory.
# The file also carries a saved index column, which pandas surfaces as "Unnamed: 0".
df = pd.read_csv("bbc-text.csv")

In [6]:
# Peek at the first rows to check the columns that were loaded.
df.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [7]:
labels = set(df['category'])
print(labels) # List the distinct categories; the spinner below is built from just one of them.

print()
print("Another method(Sudarshan's Method) to check labels:")
# Same information as an array of unique values, displayed as the cell's result.
df['category'].unique()

{'politics', 'tech', 'entertainment', 'sport', 'business'}

Another method(Sudarshan's Method) to check labels:


array(['tech', 'business', 'sport', 'entertainment', 'politics'],
      dtype=object)

In [8]:
# Only the business articles are used to build the article spinner.
# The result is the 'text' column re-indexed from 0 so txts[0] is the first business article.
is_business = df['category'] == "business"
txts = df.loc[is_business, 'text'].reset_index(drop=True)


In [9]:
# For every (previous word, next word) context, count which words occur between them.
# Layout: probs[(w(t-1), w(t+1))] = {w(t): count of w(t)}
# Remember that each value is itself a dictionary.
probs = {}
for doc in txts:
    words = word_tokenize(doc)
    # Slide a window of three consecutive tokens over the document.
    for prev_w, mid_w, next_w in zip(words, words[1:], words[2:]):
        middle_counts = probs.setdefault((prev_w, next_w), {})
        middle_counts[mid_w] = middle_counts.get(mid_w, 0) + 1

# Turn the raw counts of each context into probabilities that sum to 1.
for middle_counts in probs.values():
    n_seen = sum(middle_counts.values())
    for word in middle_counts:
        # Only existing keys are reassigned, so mutating during iteration is safe.
        middle_counts[word] = middle_counts[word] / n_seen


In [10]:
# Note: many (previous, next) contexts were seen with only one middle word, so we leave those words unchanged,
# because there is no alternative word to swap in.

In [25]:
def spin_document(doc):
    """Return a spun version of ``doc``.

    Thin public wrapper: the document string is passed straight to
    ``spin_doc`` and its result is returned unchanged.
    """
    return spin_doc(doc)

In [33]:
detokenizer = TreebankWordDetokenizer()
# The detokenizer joins tokens back into a string, putting spaces between words
# but not before punctuation. It does not always succeed: some punctuation
# can still end up with a space in front of it.

In [13]:
# Round-trip the first business article (tokenize, then detokenize) to see how the detokenizer behaves.
sample_tokens = word_tokenize(txts[0])
print(detokenizer.detokenize(sample_tokens))

worldcom boss left books alone former worldcom boss bernie ebbers who is accused of overseeing an $11bn (£5.8bn) fraud never made accounting decisions a witness has told jurors . david myers made the comments under questioning by defence lawyers who have been arguing that mr ebbers was not responsible for worldcom s problems . the phone company collapsed in 2002 and prosecutors claim that losses were hidden to protect the firm s shares . mr myers has already pleaded guilty to fraud and is assisting prosecutors . on monday defence lawyer reid weingarten tried to distance his client from the allegations . during cross examination he asked mr myers if he ever knew mr ebbers make an accounting decision . not that i am aware of mr myers replied . did you ever know mr ebbers to make an accounting entry into worldcom books mr weingarten pressed . no replied the witness . mr myers has admitted that he ordered false accounting entries at the request of former worldcom chief financial officer sc

In [16]:
# Select a random word from a probability distribution
def sample_word(d):
    """Draw one key of ``d`` at random, weighted by its value.

    Parameters
    ----------
    d : dict
        Maps each candidate word to its probability. The values are expected
        to sum to 1 (up to floating-point rounding).

    Returns
    -------
    The sampled key of ``d``.

    Raises
    ------
    ValueError
        If ``d`` is empty, since there is nothing to sample.
    """
    if not d:
        raise ValueError("cannot sample from an empty distribution")

    p0 = np.random.random()
    cumulative = 0
    for w, p in d.items():
        cumulative += p
        if p0 < cumulative:
            return w

    # Floating-point rounding can leave the running sum marginally below 1.0
    # (e.g. ten probabilities of 0.1 add up to 0.9999999999999999), so a draw
    # in that tiny gap matches no bucket. It belongs to the last word.
    return w

In [23]:
# Main function to spin a document
def spin_doc(doc, spin_prob=0.3):
    """Return ``doc`` with suggested replacement words inserted in angle brackets.

    The document is tokenized and scanned left to right. For each token that
    has both a left and a right neighbour, the (previous, next) pair is looked
    up in the module-level ``probs`` table. When that context has more than
    one known middle word, a word is sampled with probability ``spin_prob``
    and appended as ``<word>`` right after the original token.

    Parameters
    ----------
    doc : str
        Text of the document to spin.
    spin_prob : float, optional
        Chance of spinning an eligible token. Defaults to 0.3.

    Returns
    -------
    str
        The detokenized text; an empty string when ``doc`` has no tokens.
    """
    tokens = word_tokenize(doc)
    if not tokens:
        # Nothing to spin; indexing tokens[0] below would raise IndexError.
        return ""

    # Invariant: tokens[0..i] have already been copied to output.
    output = [tokens[0]]
    i = 0
    while i < len(tokens) - 2:
        t_0 = tokens[i]
        t_1 = tokens[i + 1]  # candidate for replacement
        t_2 = tokens[i + 2]

        # A context that is missing from probs has no alternatives, so it is
        # treated as an empty distribution instead of raising KeyError.
        p_dist = probs.get((t_0, t_2), {})

        # The original word is always kept; a suggestion may follow it.
        output.append(t_1)

        if len(p_dist) > 1 and np.random.random() < spin_prob:
            middle = sample_word(p_dist)
            output.append("<" + middle + ">")
            output.append(t_2)

            # The sampled word depends on t_2, so t_2 itself is not spun:
            # skip past it and continue with the next group of three tokens.
            i += 2
        else:
            i += 1

    # The loop stops one short of the end unless the final step was a spin,
    # which has already emitted the last token.
    if i == len(tokens) - 2:
        output.append(tokens[-1])

    return detokenizer.detokenize(output)

In [31]:
# Pick one business article uniformly at random (by position) and spin it.
# No random seed is set in the visible cells, so the article and the spins differ between runs.
i = np.random.choice(txts.shape[0])
new_doc = spin_document(txts.iloc[i])

In [32]:
# Wrap the spun text with textwrap so long lines do not run off the screen.
# replace_whitespace=False leaves existing whitespace characters untouched;
# fix_sentence_endings=True asks textwrap to put two spaces after sentence endings it detects.
print(textwrap.fill(new_doc , replace_whitespace = False , fix_sentence_endings = True))

giant waves damage s asia <asia> economy governments aid agencies
insurers and travel <aerospace> firms are among those counting the
cost <number> of the massive earthquake <trade> and waves that
hammered southern asia . <and> the worst-hit areas are sri lanka india
indonesia and thailand <talks> with at <not> least 23 000 people
killed . early estimates from <that> the world bank put <is> the
amount <light> of aid needed at about $5bn (£2.6bn) similar to the
cash offered central america after hurricane mitch . mitch killed
about 10 000 people and caused damage of about $10bn in 1998. world
bank spokesman damien milverton told the wall <high> street journal
that he expected an aid package of financing and debt relief . tourism
<it> is a vital part of <in> the economies of the stricken countries
providing jobs for 19 million people <people> in the <the> south east
asian region according <close> to the world <holiday> travel and
tourism council (wttc <pml>). in the maldives islands in th