In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
# Plotly libraries
import plotly
import plotly.express as px
import plotly.graph_objs as go


# Measuring run time
from time import time

# Text preprocessing/analysis
import nltk
import re, random
from nltk import word_tokenize, sent_tokenize, pos_tag
from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from spellchecker import SpellChecker

from math import log2

In [2]:
stop_words = set(stopwords.words('english'))

In [3]:
data = pd.read_csv('https://raw.githubusercontent.com/susanli2016/NLP-with-Python/master/data/title_conference.csv')
data.shape                                        

(2507, 2)

In [4]:
data.head()

Unnamed: 0,Title,Conference
0,Innovation in Database Management: Computer Sc...,VLDB
1,High performance prime field multiplication fo...,ISCAS
2,enchanted scissors: a scissor interface for su...,SIGGRAPH
3,Detection of channel degradation attack by Int...,INFOCOM
4,Pinning a Complex Network through the Betweenn...,ISCAS


# Data Preprocessing

In [4]:
def lemmatize_text(text):
    tokeniser = RegexpTokenizer(r'[A-Za-z]+')
    tokens = tokeniser.tokenize(text)
    # 2. POS tagging
    pos_map = {'J': 'a', 'N': 'n', 'R': 'r', 'V': 'v'}
    pos_tags = pos_tag(tokens)
    print(pos_tags)
    # 3. Lowercase and lemmatise 
    lemmatiser = WordNetLemmatizer()
    return ' '.join([lemmatiser.lemmatize(t.lower(), pos=pos_map.get(p[0], 'v'))  for t, p in pos_tags if p not in ['DT', 'IN']])

In [5]:
data['Title']=data.Title.apply(lemmatize_text)

[('Innovation', 'NN'), ('in', 'IN'), ('Database', 'NNP'), ('Management', 'NNP'), ('Computer', 'NNP'), ('Science', 'NNP'), ('vs', 'NN'), ('Engineering', 'VBG')]
[('High', 'JJ'), ('performance', 'NN'), ('prime', 'JJ'), ('field', 'NN'), ('multiplication', 'NN'), ('for', 'IN'), ('GPU', 'NNP')]
[('enchanted', 'VBN'), ('scissors', 'NNS'), ('a', 'DT'), ('scissor', 'NN'), ('interface', 'NN'), ('for', 'IN'), ('support', 'NN'), ('in', 'IN'), ('cutting', 'VBG'), ('and', 'CC'), ('interactive', 'JJ'), ('fabrication', 'NN')]
[('Detection', 'NN'), ('of', 'IN'), ('channel', 'NN'), ('degradation', 'NN'), ('attack', 'NN'), ('by', 'IN'), ('Intermediary', 'NNP'), ('Node', 'NNP'), ('in', 'IN'), ('Linear', 'NNP'), ('Networks', 'NNP')]
[('Pinning', 'VBG'), ('a', 'DT'), ('Complex', 'JJ'), ('Network', 'NNP'), ('through', 'IN'), ('the', 'DT'), ('Betweenness', 'NNP'), ('Centrality', 'NNP'), ('Strategy', 'NNP')]
[('Analysis', 'NN'), ('and', 'CC'), ('Design', 'NNP'), ('of', 'IN'), ('Memoryless', 'NNP'), ('Intercon

In [75]:
vectoriser = TfidfVectorizer(token_pattern=r'[a-z]+', min_df=5, max_df=.95, stop_words = stop_words)
vectoriser.fit_transform(data['Title'])
tokens = list(vectoriser.vocabulary_.keys())

In [76]:
def categorize_data_class_wise(data):
    classes = list(data['Conference'].unique())
    data_class_wise = {cls:data.groupby(['Conference']).get_group(cls)['Title'].tolist() for cls in data['Conference'].unique()}
    return data_class_wise

In [109]:
def marginal_count_of_word(data, word):
    return data.loc[[ word in word_set
        for word_set in (set(words) for words in
                         data['Title']
                         .str.strip()
                         .str.split())], :].shape[0]
marginal_count_of_word(data, 'database')                  

90

In [112]:
def marginal_count_of_class(data, cls, data_class_wise):
    return len(data_class_wise[cls])

In [113]:
print('ndot1: ', marginal_count_of_class(data, 'VLDB', data_class_wise))

ndot1:  423


In [80]:
def joint_count(word, cls, data_class_wise):
    count = 0
    to_match = []
    class_data =data_class_wise[cls]
    class_data = pd.DataFrame(data_class_wise[cls], columns = ['Title'])
    return marginal_count_of_word(class_data, word)

In [None]:
data_class_wise =categorize_data_class_wise(data)

In [81]:
def mutual_info(word,cls, data, data_class_wise):
    # print(word)
    mi = 0
    n = data.shape[0]
    n11 = joint_count(word,cls, data_class_wise)
    # print(n11)
    ndot1 = marginal_count_of_class(data, cls, data_class_wise)
    # print('ndot1: ', ndot1)
    n1dot = marginal_count_of_word(data, word)   
    # print('n1dot: ', n1dot)
    n10 = n1dot - n11
    # print('n10: ', n10)
    n01 = ndot1 -n11
    # print('n01: ', n01)
    # print(f'n10: {n10} and n01: {n01}')
    n00 = n - (n11 + n01 + n10)
    # print('n00: ', n00)
    n0dot = n01 + n00
    ndot0 = n10 + n00
    keys = [n11, n10, n01, n00]
    # print(keys)
    values = [[ndot1, n1dot], [n1dot, ndot0], [n0dot, ndot1], [n0dot, ndot0]]
    vals = {}                              
    for i, key in enumerate(keys):                  
        vals.setdefault(key, []).append(values[i])   
    # print(vals)
    for key in vals:
        for i in range(len(vals[key])):
            li = vals[key][i]
            den = li[0] * li[1]
            num = key * n
            if num == 0:
                mi+=0
            else:
                mi+= key/n * log2(num/den)
    return mi

In [82]:
mutual_info('database','VLDB', data, data_class_wise) #ndot1 = n01 +n11()

0.08354739627654717

In [106]:
# n11 is the number of documents that contain 'database' and are classified as 'VLDB':
df = data[data['Conference'] == 'VLDB']
df.reset_index(drop = True, inplace = True)

df = df.loc[['database' in word_set
        for word_set in (set(words) for words in
                         df['Title']
                         .str.strip()
                         .str.split())], :]
count = df.shape[0]
print('n11:',count)
n11 = count

n11: 86


In [107]:
# n10 is the number of documents that contain 'database' and are not in class 'VLDB':
df = data[data['Conference'] != 'VLDB']
df.reset_index(drop = True, inplace = True)

df = df.loc[['database' in word_set
        for word_set in (set(words) for words in
                         df['Title']
                         .str.strip()
                         .str.split())], :]
count = df.shape[0]
print('n10:',count)
n10 = count

n10: 4


In [110]:
# n01 is the number of documents classified as class 'VLDB' that does not contain  word 'database':
df = data[data['Conference'] == 'VLDB']
df.reset_index(drop = True, inplace = True)
df = df.loc[['database' not in word_set
        for word_set in (set(words) for words in
                         df['Title']
                         .str.strip()
                         .str.split())], :]
count = df.shape[0]
print('n01:',count)
n01 = count

n01: 337


In [111]:
# ndot1, count of the documents containing the class:
ndot1 = data[data['Conference']=='VLDB'].shape[0]
print('ndot1:', ndot1)

ndot1: 423


In [92]:
#n00 is the count of the documents without term 'database' and not belonging to class 'VLDB'
df = data[data['Conference'] != 'VLDB']
df.reset_index(drop = True, inplace = True)
df = df.loc[['database' not in word_set
        for word_set in (set(words) for words in
                         df['Title']
                         .str.strip()
                         .str.split())], :]
count = df.shape[0]
print('n00:',count)
n00 = count

n00: 2080


In [98]:
# n0dot is the number of document without term 'database':
count = 0
df = data.loc[['database' not in word_set
        for word_set in (set(words) for words in
                         data['Title']
                         .str.lower()
                         .str.strip()
                         .str.split())], :]
count = df.shape[0]
print('n0dot:', count)

n0dot: 2417


In [114]:
# ndot0 is the number of documents that do not belong to class 'VLDB':
print('ndot0:', data[data['Conference'] != 'VLDB'].shape[0])

ndot0: 2084


In [100]:
def calc_mutual_info(Total_words):
    classes = list(data['Conference'].unique())
    mi_data = {}
    for cls in classes:
        print(cls)
        mi_data[cls]= [mutual_info(word, cls, data, data_class_wise) for word in Total_words]
    return mi_data

In [71]:
mi_data = calc_mutual_info(tokens)

VLDB
ISCAS
SIGGRAPH
INFOCOM
WWW


In [72]:
df = pd.DataFrame(data = mi_data, index = tokens, columns = list(data['Conference'].unique()))
df.to_excel('MI_data.xlsx')

In [73]:
def select_top_k_words(k, mi_data, words):
    classes = list(data['Conference'].unique())
    df = pd.DataFrame(data = mi_data, index = words, columns = classes )
    best_k = {}
    for cls in classes:
        best_k[cls] = df[cls].sort_values(ascending = False).head(k).index.tolist()
        print(cls, best_k[cls])
    return best_k

In [74]:
best_k = select_top_k_words(20, mi_data, tokens)

VLDB ['database', 'data', 'query', 'network', 'relational', 'index', 'wireless', 'power', 'join', 'schema', 'management', 'cmos', 'processing', 'sql', 'low', 'issue', 'dbms', 'circuit', 'xml', 'top']
ISCAS ['cmos', 'low', 'web', 'power', 'circuit', 'network', 'converter', 'query', 'filter', 'amplifier', 'voltage', 'noise', 'database', 'design', 'delta', 'mode', 'sigma', 'high', 'frequency', 'data']
SIGGRAPH ['network', 'animation', 'surface', 'graphic', 'light', 'display', 'reality', 'power', 'computer', 'low', 'wireless', 'rendering', 'web', 'render', 'texture', 'image', 'panel', 'interactive', 'cmos', 'radiosity']
INFOCOM ['network', 'wireless', 'routing', 'hoc', 'packet', 'hop', 'sensor', 'ad', 'ip', 'web', 'database', 'multicast', 'optimal', 'qos', 'scheduling', 'cmos', 'schedule', 'rout', 'image', 'congestion']
WWW ['web', 'search', 'semantic', 'social', 'wireless', 'rank', 'low', 'topic', 'power', 'service', 'cmos', 'recommendation', 'information', 'user', 'news', 'engine', 'netw

In [66]:
import csv
w = csv.writer(open("SelectKBest(Mutual_Info).csv", "w"))
for key, val in best_k.items():
    w.writerow([key, val])