In [163]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from apyori import apriori
import re
import calendar
import networkx as nx
from pyvis.network import Network

In [164]:
# Read csv file and make data frame
df = pd.read_csv(r'C:\Users\mevamsi\Documents\BITS\Data Mining\Assignment\data_mining1\data\clean_data.csv', error_bad_lines=False,low_memory=False)

In [165]:
# Data set Visualisation
df.head(5)

Unnamed: 0.1,Unnamed: 0,Patient Number,Date Announced,Age Bracket,Gender,Detected District,Detected State,Current Status,Notes,Type of transmission,Source_1,Source_2,Backup Notes,AgeGroup
0,0,1.0,30/01/2020,20.0,F,Thrissur,Kerala,Recovered,Travelled from Wuhan,Imported,https://twitter.com/vijayanpinarayi/status/122...,https://weather.com/en-IN/india/news/news/2020...,Student from Wuhan,15-24
1,1,2.0,02/02/2020,-1.0,M,Alappuzha,Kerala,Recovered,Travelled from Wuhan,Imported,https://www.indiatoday.in/india/story/kerala-r...,https://weather.com/en-IN/india/news/news/2020...,Student from Wuhan,15-24
2,2,3.0,03/02/2020,-1.0,M,Kasaragod,Kerala,Recovered,Travelled from Wuhan,Imported,https://www.indiatoday.in/india/story/kerala-n...,https://twitter.com/ANI/status/122422148580539...,Student from Wuhan,15-24
3,3,4.0,02/03/2020,45.0,M,East Delhi,Delhi,Recovered,"Travelled from Austria, Italy",Imported,https://www.indiatoday.in/india/story/not-a-ja...,https://economictimes.indiatimes.com/news/poli...,Travel history to Italy and Austria,45-54
4,4,5.0,02/03/2020,24.0,M,Hyderabad,Telangana,Recovered,"Travelled from Dubai to Bangalore on 20th Feb,...",Imported,https://www.deccanherald.com/national/south/qu...,https://www.indiatoday.in/india/story/coronavi...,"Travel history to Dubai, Singapore contact",15-24


In [166]:
# Remove unnecessary columns
df.drop(['Unnamed: 0', 'Patient Number', 'Age Bracket', 'Notes', 'Backup Notes', 'Type of transmission'], axis = 1, inplace = True)

In [167]:
# visualise data frame after removing columns
df.head(5)

Unnamed: 0,Date Announced,Gender,Detected District,Detected State,Current Status,Source_1,Source_2,AgeGroup
0,30/01/2020,F,Thrissur,Kerala,Recovered,https://twitter.com/vijayanpinarayi/status/122...,https://weather.com/en-IN/india/news/news/2020...,15-24
1,02/02/2020,M,Alappuzha,Kerala,Recovered,https://www.indiatoday.in/india/story/kerala-r...,https://weather.com/en-IN/india/news/news/2020...,15-24
2,03/02/2020,M,Kasaragod,Kerala,Recovered,https://www.indiatoday.in/india/story/kerala-n...,https://twitter.com/ANI/status/122422148580539...,15-24
3,02/03/2020,M,East Delhi,Delhi,Recovered,https://www.indiatoday.in/india/story/not-a-ja...,https://economictimes.indiatimes.com/news/poli...,45-54
4,02/03/2020,M,Hyderabad,Telangana,Recovered,https://www.deccanherald.com/national/south/qu...,https://www.indiatoday.in/india/story/coronavi...,15-24


In [168]:
# Rename the columns for ease of use
df.columns = ['date', 'gender', 'district', 'state', 'status', 'source_1', 'source_2', 'age_group']
df.head(5)

Unnamed: 0,date,gender,district,state,status,source_1,source_2,age_group
0,30/01/2020,F,Thrissur,Kerala,Recovered,https://twitter.com/vijayanpinarayi/status/122...,https://weather.com/en-IN/india/news/news/2020...,15-24
1,02/02/2020,M,Alappuzha,Kerala,Recovered,https://www.indiatoday.in/india/story/kerala-r...,https://weather.com/en-IN/india/news/news/2020...,15-24
2,03/02/2020,M,Kasaragod,Kerala,Recovered,https://www.indiatoday.in/india/story/kerala-n...,https://twitter.com/ANI/status/122422148580539...,15-24
3,02/03/2020,M,East Delhi,Delhi,Recovered,https://www.indiatoday.in/india/story/not-a-ja...,https://economictimes.indiatimes.com/news/poli...,45-54
4,02/03/2020,M,Hyderabad,Telangana,Recovered,https://www.deccanherald.com/national/south/qu...,https://www.indiatoday.in/india/story/coronavi...,15-24


In [169]:
# Combine the source columns and shorten the url to domain and handle
df['source'] = df['source_1']
df['source'].fillna(df['source_2'], inplace = True)
df.drop(['source_1', 'source_2'], axis = 1, inplace = True)

def source_shortener(source):
    shortened_source = []
    for url in source:
        domain = re.findall(r'^(?:.*:\/\/)?(?:www\.)?([^:\/]*).*$', str(url), re.M|re.I)[0]
        if len(domain) <= 0:
            domain = 'unknown'
        if re.search(r'(twitter|facebook)', domain):
            handle = re.findall(r'(?<=\.com\/)(.*?)(?=\/)', url)
            domain = re.sub('.com', '', domain)
            if len(handle) > 0:
                domain += ' @' + handle[0]
            else:
                domain += ' unknown'
        shortened_source.append(domain)
    return shortened_source

df['source'] = source_shortener(df['source'])

In [170]:
# Convert dates to month
df['date'] = pd.to_datetime(df['date'])
df['month'] = pd.DatetimeIndex(df['date']).month_name()
df.drop(['date'], axis = 1, inplace = True)

In [171]:
# Skimming through the data for outliers
for i in df.columns:
    print(df[i].unique().tolist())

['F', 'M', 'Other']
['Thrissur', 'Alappuzha', 'Kasaragod', 'East Delhi', 'Hyderabad', 'Italians', 'Agra', 'South West Delhi', 'Ghaziabad', 'West Delhi', 'Leh', 'Kancheepuram', 'Pathanamthitta', 'North Delhi', 'Jammu', 'Bengaluru Urban', 'Kannur', 'Pune', 'Amritsar', 'Ernakulam', 'Kottayam', 'Jaipur', 'Mumbai', 'Nagpur', 'S.P.S. Nellore', 'Lucknow', 'Kalaburagi', 'New Delhi', 'Thiruvananthapuram', 'Ahmednagar', 'Thane', 'Raigad', 'Yavatmal', 'Idukki', 'Aurangabad', 'Dehradun', 'Malappuram', 'Khordha', 'South Delhi', 'Gurugram', 'Kargil', 'Mahe', 'Gautam Buddha Nagar', 'Kolkata', 'Chennai', 'Ratnagiri', 'Srinagar', 'Jhunjhunu', 'North East Delhi', 'Chandigarh', 'Kodagu', 'Prakasam', 'Raipur', 'Lakhimpur Kheri', 'Shahid Bhagat Singh Nagar', 'Rajkot', 'Surat', 'Visakhapatnam', 'North West Delhi', 'S.A.S. Nagar', 'Ahmedabad', 'Vadodara', 'Bhilwara', 'Faridabad', 'Panipat', 'Kangra', 'Jabalpur', 'Karimnagar', nan, 'North 24 Parganas', 'Chikkaballapura', 'Panchkula', 'Gandhinagar', 'Moradabad

In [172]:
# District has value italians, so remove it
df['district'].replace(to_replace = ['Italians'], value = np.NaN)

0            Thrissur
1           Alappuzha
2           Kasaragod
3          East Delhi
4           Hyderabad
             ...     
99930         Pulwama
99931           Reasi
99932        Kishtwar
99933     East Sikkim
99934    South Sikkim
Name: district, Length: 99935, dtype: object

In [173]:
# Visualise null column values
for i in df.columns:
    print(i, df[i].isnull().sum(), sep = "\t")

gender	0
district	7720
state	0
status	0
age_group	0
source	0
month	0


In [174]:
# Fill the district null values with the mode value of the state
districtstatemap = {i : df[df['state'] == i].mode().district[0] for i in df['state'].unique().tolist()}
df['district'] = df['district'].fillna(df['state'].map(districtstatemap))
print(df['district'].isnull().sum())

0


In [175]:
df.head(5)

Unnamed: 0,gender,district,state,status,age_group,source,month
0,F,Thrissur,Kerala,Recovered,15-24,twitter @vijayanpinarayi,January
1,M,Alappuzha,Kerala,Recovered,15-24,indiatoday.in,February
2,M,Kasaragod,Kerala,Recovered,15-24,indiatoday.in,March
3,M,East Delhi,Delhi,Recovered,45-54,indiatoday.in,February
4,M,Hyderabad,Telangana,Recovered,15-24,deccanherald.com,February


In [176]:
# Convert all the values to nested lists
records = df.values.tolist()
print(records[:10])

[['F', 'Thrissur', 'Kerala', 'Recovered', '15-24', 'twitter @vijayanpinarayi', 'January'], ['M', 'Alappuzha', 'Kerala', 'Recovered', '15-24', 'indiatoday.in', 'February'], ['M', 'Kasaragod', 'Kerala', 'Recovered', '15-24', 'indiatoday.in', 'March'], ['M', 'East Delhi', 'Delhi', 'Recovered', '45-54', 'indiatoday.in', 'February'], ['M', 'Hyderabad', 'Telangana', 'Recovered', '15-24', 'deccanherald.com', 'February'], ['M', 'Italians', 'Rajasthan', 'Recovered', '65-74', 'indianexpress.com', 'March'], ['M', 'Italians', 'Haryana', 'Recovered', '55-64', 'indianexpress.com', 'April'], ['M', 'Italians', 'Haryana', 'Recovered', '55-64', 'indianexpress.com', 'April'], ['M', 'Italians', 'Haryana', 'Recovered', '55-64', 'indianexpress.com', 'April'], ['M', 'Italians', 'Haryana', 'Recovered', '55-64', 'indianexpress.com', 'April']]


In [177]:
# Applying apriori
association_rules = apriori(records, min_support=0.022, min_confidence=0.2, min_lift=2, min_length=2)
association_results = list(association_rules)
print(len(association_results))

194


In [178]:
# Convert apriori's frozen sets to nested lists for easy interpretation
rules = []
for i in association_results:
    rule = []
    #base
    rule.append(list(i[2][0][0]))
    #antecedent
    rule.append((list(i[2][0][1])))
    # support
    rule.append(str(i[1]))
    # confidence
    rule.append(str(i[2][0][2]))
    # lift
    rule.append(str(i[2][0][3]))
    rules.append(rule)

In [179]:
# Printing/Visuaising the rules
for i in rules:
    base = ", ".join(i[0])
    antecedents = ", ".join(i[1])
    print("Rule:       " + base + " -> " + antecedents)
    print("Support:    " + i[2])
    print("Confidence: " + i[3]) 
    print("Lift:       " + i[4]) 
    print("=====================================")

Rule:       Ahmedabad -> Gujarat
Support:    0.027077600440286186
Confidence: 1.0
Lift:       17.668847241867045
Rule:       Gujarat -> April
Support:    0.030349727322759795
Confidence: 0.5362446958981613
Lift:       2.2783730999780087
Rule:       April -> Maharashtra
Support:    0.0647420823535298
Confidence: 0.2750733387185919
Lift:       2.50222593344643
Rule:       April -> Mumbai
Support:    0.05269425126332116
Confidence: 0.22388503890140726
Lift:       3.3508988112344067
Rule:       April -> twitter @ANI
Support:    0.08727672987441837
Confidence: 0.3708175672803027
Lift:       2.4228606463652858
Rule:       Bihar -> twitter @BiharHealthDept
Support:    0.06106969530194627
Confidence: 0.7581366459627329
Lift:       12.414285714285713
Rule:       Chennai -> May
Support:    0.10297693500775504
Confidence: 0.6800370052203792
Lift:       2.0829859043921597
Rule:       Chennai -> Tamil Nadu
Support:    0.15142842847851104
Confidence: 1.0
Lift:       4.092845148871688
Rule:       Che

In [182]:
# Visualising via network graphs
G = nx.DiGraph()
rulecount = 0
for i in rules:
    G.add_nodes_from([('R' + str(rulecount), {"color": "red"})])
    for j in i[0]:
        G.add_nodes_from([(j, {"color": "yellow"})])
        G.add_edges_from([(j, 'R' + str(rulecount))])
    for j in i[1]:
        G.add_nodes_from([(j, {"color": "green"})])
        G.add_edges_from([('R' + str(rulecount), j)])
    rulecount += 1
net = Network(notebook = True)
net.from_nx(G)
net.show('arm.html')