In [77]:
import re
from nltk.corpus import stopwords
import nltk
import collections
import math
from sklearn.feature_extraction.text import TfidfVectorizer

In [109]:
stemmer = nltk.stem.porter.PorterStemmer()
WORD = re.compile(r'\w+')

In [2]:
caps = "([A-Z])"
prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = "(Inc|Ltd|Jr|Sr|Co)"
starters = "(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = "[.](com|net|org|io|gov)"

In [3]:
def split_into_sentences(text):
    text = " " + text + "  "
    text = text.replace("\n"," ")
    text = re.sub(prefixes,"\\1<prd>",text)
    text = re.sub(websites,"<prd>\\1",text)
    if "Ph.D" in text: text = text.replace("Ph.D.","Ph<prd>D<prd>")
    text = re.sub("\s" + caps + "[.] "," \\1<prd> ",text)
    text = re.sub(acronyms+" "+starters,"\\1<stop> \\2",text)
    text = re.sub(caps + "[.]" + caps + "[.]" + caps + "[.]","\\1<prd>\\2<prd>\\3<prd>",text)
    text = re.sub(caps + "[.]" + caps + "[.]","\\1<prd>\\2<prd>",text)
    text = re.sub(" "+suffixes+"[.] "+starters," \\1<stop> \\2",text)
    text = re.sub(" "+suffixes+"[.]"," \\1<prd>",text)
    text = re.sub(" " + caps + "[.]"," \\1<prd>",text)
    if "”" in text: text = text.replace(".”","”.")
    if "\"" in text: text = text.replace(".\"","\".")
    if "!" in text: text = text.replace("!\"","\"!")
    if "?" in text: text = text.replace("?\"","\"?")
    text = text.replace(".",".<stop>")
    text = text.replace("?","?<stop>")
    text = text.replace("!","!<stop>")
    text = text.replace("<prd>",".")
    sentences = text.split("<stop>")
    sentences = sentences[:-1]
    sentences = [s.strip() for s in sentences]
    return sentences

In [4]:
text = "An army soldier was injured in fierce gun battle with a group of infiltrating terrorists from across the Line of Control in Gali Maidan area of Sawjian sector, while a BSF jawan was injured in unprovoked firing by Pakistani rangers in Hiranagar sector along the international border in Kathua district on Friday.Identifying the injured army soldier as Launce Naik Vinod Kumar and the BSF jawan as Gurnam Singh, sources said that former got injured during an encounter with a group of terrorists who sneaked into Sawjian sector on the Indian side from the Pakistan occupied Kashmir during wee hours of Friday. The encounter was in progress, sources added.Significantly, the infiltration attempt from across the LoC in Sawjian sector of Poonch district came nearly 24 hours after half a dozen heavily armed terrorists attacked a BSF naka along the international border in Kathua district with small arms fire and rocket propelled grenades so as to cross over to the Indian side. The infiltration attempt was foiled by alert BSF personnel who retaliated killing one of them as during illumation of the area with the help of tracer bomb, terrorists fleeing back to Pakistan side were seen carrying a body with them, sources added.Meanwhile, a BSF jawan was injured as Pakistani Rangers continued resorting to mortar shelling and small arms fire on two forward Indian positions at Bobiyan in Hiranagar sector of Kathua district. Sources said that the fire from across the international border first came around 9.35 am and it continued for nearly 40 minutes.Thereafter, the Pakistani Rangers again resumed firing on Indian side around 12.15 noon, sources said, adding that it was continuing till reports last came in. The Indian side was also retaliating.Ever since, India carried out surgical strikes across the Line of Control causing sufficient damage to terrorists and those shielding them last month, Pakistan has been resorting to mortar shelling and small arms fire at one or the other place along the borders in Jammu region. It continued lobbing mortar shells, besides resorting to automatics and small arms fire along the LoC in Rajouri district’s Manjakote area of Bhimber Gali sector throughout Wednesday night.The Indian troops retaliated appropriately. There had been no casualty or damage on the Indian side. Pakistani troops have resorted to firing in Rajouri sector also this afternoon."

In [5]:
sentences = split_into_sentences(text)
print len(sentences)

16


In [6]:
stop = set(stopwords.words('english'))

In [None]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


In [84]:
def remove_stop_words(sentences) :
    tokenized_sentences = []
    for sentence in sentences :
        tokens = []
        split = sentence.lower().split()
        for word in split :
            if word not in stop :
                try :
                    stemmer.stem(word)
                    tokens.append(word)
                except :
                    tokens.append(word)
        
        tokenized_sentences.append(tokens)
    return tokenized_sentences
        

In [85]:
tokenized_sentences = remove_stop_words(sentences)

In [13]:
def posTagger(tokenized_sentences) :
    tagged = []
    for sentence in tokenized_sentences :
        tag = nltk.pos_tag(sentence)
        tagged.append(tag)
    return tagged

In [16]:
tagged = posTagger(remove_stop_words(sentences))


In [48]:
def tfIsf(tokenized_sentences):
    scores = []
    COUNTS = []
    for sentence in tokenized_sentences :
        counts = collections.Counter(sentence)
        isf = []
        score = 0
        for word in counts.keys() :
            count_word = 1
            for sen in tokenized_sentences :
                for w in sen :
                    if word == w :
                        count_word += 1
            score = score + counts[word]*math.log(count_word-1)
        scores.append(score/len(sentence))
    return scores

In [53]:
tfIsfScore = tfIsf(tokenized_sentences)

In [50]:
def similar(tokens_a, tokens_b) :
    #Using Jaccard similarity to calculate if two sentences are similar
    ratio = len(set(tokens_a).intersection(tokens_b))/ float(len(set(tokens_a).union(tokens_b)))
    return ratio

In [54]:
def similarityScores(tokenized_sentences) :
    scores = []
    for sentence in tokenized_sentences :
        score = 0;
        for sen in tokenized_sentences :
            if sen != sentence :
                score += similar(sentence,sen)
        scores.append(score)
    return scores

In [56]:
similarityScore = similarityScores(tokenized_sentences)

In [66]:
def properNounScores(tagged) :
    scores = []
    for i in range(len(tagged)) :
        score = 0
        for j in range(len(tagged[i])) :
            if(tagged[i][j][1]== 'NNP' or tagged[i][j][1]=='NNPS') :
                score += 1
        scores.append(score/float(len(tagged[i])))
    return scores
        

In [69]:
properNounScore = properNounScores(tagged)

In [118]:
def centroidSimilarity(sentences) :
    centroidIndex = tfIsfScore.index(max(tfIsfScore))
    scores = []
    for sentence in sentences :
        vec1 = text_to_vector(sentences[centroidIndex])
        vec2 = text_to_vector(sentence)
        
        score = get_cosine(vec1,vec2)
        scores.append(score)
    return scores

In [112]:
def text_to_vector(text):
    words = WORD.findall(text)
    return collections.Counter(words)

In [113]:
def get_cosine(vec1, vec2):
    intersection = set(vec1.keys()) & set(vec2.keys())
    numerator = sum([vec1[x] * vec2[x] for x in intersection])

    sum1 = sum([vec1[x]**2 for x in vec1.keys()])
    sum2 = sum([vec2[x]**2 for x in vec2.keys()])
    denominator = math.sqrt(sum1) * math.sqrt(sum2)

    if not denominator:
        return 0.0
    else:
        return float(numerator) / denominator

In [119]:
centroidSimilarityScore = centroidSimilarity(sentences)

In [137]:
def numericToken(tokenized_sentences):
    scores = []
    for sentence in tokenized_sentences :
        score = 0
        for word in sentence :
            if is_number(word) :
                score +=1 
        scores.append(score/float(len(sentence)))
    return scores

In [138]:
numericTokenScore = numericToken(tokenized_sentences)

In [139]:
numericTokenScore

[0.0,
 0.0,
 0.0,
 0.02857142857142857,
 0.0,
 0.0,
 0.1,
 0.4,
 0.1111111111111111,
 0.09090909090909091,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0]

In [129]:
def is_number(s):
    try:
        float(s)
        return True
    except ValueError:
        return False