In [1]:
from collections import defaultdict

In [2]:
# Load the vocabulary: the first whitespace-separated token of every line of
# general/words.txt, in file order. The n-gram helpers in this notebook map
# between words and integer ids through positions in this list
# (words[...] / words.index(...)), so the order must not be changed.
# The `with` block closes the file handle, which the plain open() never did.
with open('general/words.txt') as wordsFile:
    words = [word.split()[0] for word in wordsFile]

In [3]:
# creating dict with every category's nGram count distribution
def generate_categories_dict():
    """Load the n-gram tables of every category from text files.

    For each category, five files are read in the order uni, bi, tri, four,
    five. The ``general`` files are named ``general/<n>grams.txt``; every other
    category uses ``<category>/<category>_<n>grams.txt``.

    Returns:
        A mapping ``category -> [uni, bi, tri, four, five]``. ``uni`` is a list
        of floats: each line's integer divided by the sum of all lines. Every
        other entry is a list of rows, one per line, each row being the list of
        integers found on that line.

    Raises:
        OSError: if one of the files cannot be opened.
        ValueError: if a line contains a token that is not an integer.
        ZeroDivisionError: if a unigram file is non-empty but sums to zero.
    """
    # defaultdict() without a default_factory behaves like a plain dict:
    # looking up a missing category still raises KeyError.
    categories = defaultdict()
    for category in ['general', 'business', 'entertainment', 'politics', 'sport', 'tech']:
        ngrams = []
        for n in ['uni', 'bi', 'tri', 'four', 'five']:
            if category == 'general':
                filename = f'{category}/{n}grams.txt'
            else:
                filename = f'{category}/{category}_{n}grams.txt'

            # `with` closes the handle even when int() raises on a bad line.
            with open(filename) as file:
                if n == 'uni':
                    ngramVector = [int(l) for l in file]
                else:
                    ngramVector = [[int(x) for x in l.split()] for l in file]

            # precomputing probabilities for unigrams
            if n == 'uni':
                uniSum = sum(ngramVector)
                ngramVector = [x/uniSum for x in ngramVector]

            ngrams.append(ngramVector)
        categories[category] = ngrams
    return categories

In [4]:
def probsUnigram(categories, category):
    """Return the unigram entry (index 0) of the given category's n-gram tables."""
    ngramTables = categories[category]
    return ngramTables[0]

In [5]:
def probsNGram(evidence, categories, category, words):
    """Return the next-word distribution of a category given preceding words.

    NOTE: this name is defined again in the following cell; after running the
    notebook top to bottom the later definition is the one in effect.

    Args:
        evidence: whitespace-separated context words. Only the LAST four are
            used (the original kept the first four, dropping the words closest
            to the prediction point). Callers in this notebook always pass at
            least one word.
        categories: mapping ``category -> list of n-gram tables``; table ``n``
            holds rows of ``n + 2`` integers: ``n + 1`` word ids, then a count.
        category: key into ``categories``.
        words: vocabulary list; a word's id is its position in this list.

    Returns:
        ``[[word, probability], ...]`` for every row whose first ``n`` ids
        equal the evidence ids, with probability = row count / sum of matching
        counts. ``[['', 0]]`` when an evidence word is not in ``words``.
        An empty list when no row matches.
    """
    evidenceWords = evidence.split()
    n = len(evidenceWords)
    if n > 4:
        n = 4
        # keep the most recent context, i.e. the tail of the evidence
        evidenceWords = evidenceWords[-n:]
    if any(word not in words for word in evidenceWords):
        return [['', 0]]
    corpus = categories[category][n]
    # Translate the evidence to ids once instead of once per corpus row.
    evidenceIds = [words.index(e) for e in evidenceWords]
    counts = [[words[v[n]], v[n+1]] for v in corpus if v[:n] == evidenceIds]
    countsSum = sum(v[1] for v in counts)
    probabilities = [[v[0], v[1]/countsSum] for v in counts]
    return probabilities

In [6]:
def probsNGram(evidence, categories, category, words):
    """Return the next-word distribution of a category given preceding words.

    Args:
        evidence: whitespace-separated context words; only the last four are
            used. Callers in this notebook always pass at least one word.
        categories: mapping ``category -> list of n-gram tables``; table ``n``
            holds rows of ``n + 2`` integers: ``n + 1`` word ids, then a count.
        category: key into ``categories``.
        words: vocabulary list; a word's id is its position in this list.

    Returns:
        ``[[word, probability], ...]`` for every row whose first ``n`` ids
        equal the evidence ids, with probability = row count / sum of matching
        counts. ``[['', 0]]`` when an evidence word is not in ``words``.
        An empty list when no row matches.
    """
    evidenceWords = evidence.split()
    n = len(evidenceWords)
    if n > 4:
        n = 4
        evidenceWords = evidenceWords[-n:]
    corpus = categories[category][n]
    # Translate the evidence to ids once. The original rebuilt this list for
    # every corpus row, so each call did rows x vocabulary work for a value
    # that never changes inside the loop.
    try:
        evidenceIds = [words.index(e) for e in evidenceWords]
    except ValueError:
        # an evidence word is outside the vocabulary: no usable distribution
        return [['', 0]]
    counts = [[words[v[n]], v[n+1]] for v in corpus if v[:n] == evidenceIds]
    countsSum = sum(v[1] for v in counts)
    probabilities = [[v[0], v[1]/countsSum] for v in counts]
    return probabilities


In [7]:
def unigramProbability(word, probVector, words):
    """Look up one word's unigram probability.

    ``probVector`` is indexed by the word's first position in ``words``.
    Words that are not in the vocabulary get probability 0.
    """
    if word not in words:
        return 0
    position = words.index(word)
    return probVector[position]

In [8]:
def nGramProbability(word, probVector):
    """Return the probability paired with ``word`` in ``probVector``.

    ``probVector`` is a list of ``[word, probability]`` pairs. The first pair
    whose word matches wins; 0 is returned when there is no match.
    """
    matches = [pair[1] for pair in probVector if pair[0] == word]
    return matches[0] if matches else 0

In [9]:
def mixedProb(word, words, uniDist, nGramsDists, lambdas):
    """Return the interpolated probability of ``word`` over all n-gram orders.

    ``lambdas[0]`` weights the unigram probability and ``lambdas[k]`` weights
    the k-th entry of ``nGramsDists`` (1-based), so ``lambdas`` needs at least
    ``len(nGramsDists) + 1`` entries.
    """
    total = lambdas[0] * unigramProbability(word, uniDist, words)
    for order, dist in enumerate(nGramsDists, start=1):
        total += lambdas[order] * nGramProbability(word, dist)
    return total

In [10]:
def predictNextWord(evidence, categories, category, probsUni, lambdas=[0.2]*5):
    """Predict the most probable next word after ``evidence`` for a category.

    The score of each vocabulary word is an interpolation (see ``mixedProb``)
    of its unigram probability and its n-gram probabilities given the last
    1..n words of the evidence, n <= 4.

    NOTE: the vocabulary is read from the module-level ``words`` list, not from
    a parameter, so the cell that loads ``words`` must have run first.

    Args:
        evidence: whitespace-separated context; only the last four words count.
        categories: mapping ``category -> n-gram tables``.
        category: key into ``categories``.
        probsUni: unigram probabilities aligned with ``words``.
        lambdas: interpolation weights, unigram first. Only the first ``n + 1``
            are used and they are renormalised to sum to 1. The default list
            is never mutated here.

    Returns:
        The vocabulary word with the highest mixed probability; on ties, the
        one that comes first in ``words``.

    Raises:
        ValueError: if ``words`` is empty (the original raised IndexError).
        ZeroDivisionError: if the relevant lambdas sum to zero.
    """
    # Five-grams are the longest tables, so at most four context words help.
    evidenceWords = evidence.split()[-4:]
    n = len(evidenceWords)

    # Keep only the weights of the orders we can evaluate, then renormalise.
    relevantLambdas = lambdas[:n+1]
    sumRelevantLambdas = sum(relevantLambdas)
    normLambda = [x/sumRelevantLambdas for x in relevantLambdas]

    # One distribution per context length: last word, last two words, ...
    nGramsDists = [
        probsNGram(' '.join(evidenceWords[-i:]), categories, category, words)
        for i in range(1, n+1)
    ]

    def score(word):
        return mixedProb(word, words, probsUni, nGramsDists, normLambda)

    # max() returns the first maximal item, which is the same winner the
    # original obtained by stably sorting the whole vocabulary in descending
    # order and taking element 0, without the O(V log V) sort.
    return max(words, key=score)

In [11]:
def getAllPredictions(evidence, categories, category, probsUni, lambdas=[0.2]*5):
    """Greedily extend ``evidence`` by five predicted words.

    Each predicted word is printed, appended to the result and then used as
    context for the next prediction. The context is the last three words of
    the previous context plus the new word.

    Args:
        evidence: whitespace-separated starting context.
        categories: mapping ``category -> n-gram tables``.
        category: key into ``categories``.
        probsUni: unigram probabilities passed on to ``predictNextWord``.
        lambdas: interpolation weights passed on to ``predictNextWord``.

    Returns:
        The list of the five predicted words, in order.
    """
    recommendedWords = []
    newEvidence = evidence
    for _ in range(5):
        # Forward the caller's probsUni and lambdas; the original ignored both
        # and always used the category's own unigrams with default weights.
        newWord = predictNextWord(newEvidence, categories, category,
                                  probsUni, lambdas)

        recommendedWords.append(newWord)
        print(newWord)
        newEvidence = ' '.join(newEvidence.split()[-3:] + [newWord])
    return recommendedWords

In [12]:
def predictCategory(evidence, categories, words):
    """Guess which category a text belongs to from its unigram probabilities.

    For each candidate category the score is (sum of the unigram probabilities
    of the evidence words that have non-zero probability) multiplied by (the
    number of such words).

    Args:
        evidence: whitespace-separated text to classify.
        categories: mapping ``category -> n-gram tables``. Candidate
            categories that are absent from the mapping score 0.
        words: vocabulary list aligned with the unigram tables.

    Returns:
        The name of the highest-scoring category; on ties, the one listed
        first in business, entertainment, politics, sport, tech.
    """
    categoriesV = ['business', 'entertainment', 'politics', 'sport', 'tech']
    evidenceWords = evidence.split()

    # Iterate over the candidate names themselves. The original looped over
    # range(1, len(categories)), which only lined up with categoriesV when the
    # mapping held exactly six entries.
    categoriesProbs = []
    for name in categoriesV:
        if name not in categories:
            categoriesProbs.append(0)
            continue
        uniProbs = probsUnigram(categories, name)
        probSum = 0
        hits = 0
        for word in evidenceWords:
            wordProb = unigramProbability(word, uniProbs, words)
            if wordProb != 0:
                hits += 1
                probSum += wordProb
        # With no hits probSum is 0, so the product is 0 as in the original.
        categoriesProbs.append(probSum * hits)

    return categoriesV[categoriesProbs.index(max(categoriesProbs))]

In [128]:
# Spot check: rows of the 'sport' table at index 1 (the second table loaded by
# generate_categories_dict) whose first two entries are both 3.
# Requires `categories`, which is assigned in a cell that appears further down.
[v for v in categories['sport'][1] if v[0] == 3 and v[1] == 3]

[[3, 3, 1]]

In [134]:
%%time
# Timed single-word prediction for the 'sport' category.
# NOTE(review): the recorded output below is a 3-item list, but predictNextWord
# as defined in this notebook returns one word; the output looks stale.
predictNextWord('during the game the', categories, 'sport', probsUnigram(categories, 'sport'))

Wall time: 54.5 s


['football', 'set', 'crowd']

In [157]:
%%time 
# Timed five-word greedy continuation of 'game' for the 'sport' category
# (each predicted word is also printed by getAllPredictions).
getAllPredictions('game', categories, 'sport', probsUnigram(categories, 'sport'))

the
atmosphere
in
the
city
Wall time: 8min 8s


['the', 'atmosphere', 'in', 'the', 'city']

In [159]:
# Scratch: build the cumulative prefixes of a word list.
# v.index(word) returns the first occurrence, so a repeated word would yield
# the shorter prefix twice.
v = ['the', 'and', 'of']
[' '.join(v[:v.index(word)+1]) for word in v]

['the', 'the and', 'the and of']

In [13]:
# Load all n-gram tables from disk. Every cell that reads `categories`
# (including those placed above this one) needs this cell to have run first.
categories = generate_categories_dict()

In [15]:
# Compare the five-word continuation of the same prompt across categories.
# getAllPredictions prints each word as it is predicted; the tuple printed
# afterwards pairs the category with the full list.
for cat in ['business', 'entertainment', 'politics', 'sport', 'tech']:
    pred = getAllPredictions('it is unclear why', categories, cat, probsUnigram(categories, cat))
    print((cat, pred))

the
three
had
been
detained
('business', ['the', 'three', 'had', 'been', 'detained'])
not
music
we
already
have
('entertainment', ['not', 'music', 'we', 'already', 'have'])
the
government
was
considering
scrapping
('politics', ['the', 'government', 'was', 'considering', 'scrapping'])
i
am
in
this
situation
('sport', ['i', 'am', 'in', 'this', 'situation'])
the
blocking
was
taking
place
('tech', ['the', 'blocking', 'was', 'taking', 'place'])


In [16]:
# Same per-category comparison as the previous cell, with a different prompt.
for cat in ['business', 'entertainment', 'politics', 'sport', 'tech']:
    pred = getAllPredictions('the obvious conclusion is', categories, cat, probsUnigram(categories, cat))
    print((cat, pred))

the
latest
in
a
series
('business', ['the', 'latest', 'in', 'a', 'series'])
the
first
time
in
the
('entertainment', ['the', 'first', 'time', 'in', 'the'])
the
the
original
text
was
('politics', ['the', 'the', 'original', 'text', 'was'])
a
good
performance
he
is
('sport', ['a', 'good', 'performance', 'he', 'is'])
the
first
time
you
can
('tech', ['the', 'first', 'time', 'you', 'can'])


In [None]:
# NOTE(review): `data` is not defined in any visible cell and this cell has no
# execution count; it presumably raises NameError on a fresh kernel. Confirm
# whether it is a leftover that can be removed.
data.text