# Load CSV file as a dataframe

In [2]:
import pandas as pd
# NOTE(review): numpy is bound to the unusual alias `py` (the conventional
# alias is `np`); no cell visible in this notebook references it.
import numpy as py

# Read the raw breach dataset from the working directory. The YEAR column is
# compared against string codes in a later cell, so it is presumably parsed as
# text rather than integers -- TODO confirm against the CSV.
df = pd.read_csv("DataBreaches.csv")

# Transforming the YEAR column to actual years

In [3]:
# The raw YEAR column holds an offset code ("0" .. "14") instead of a calendar
# year: code "0" stands for 2004 and each following code is one year later.
# Codes are matched as strings; rows whose YEAR is anything else are untouched.
first_year = 2004
for offset in range(15):
    is_code = df["YEAR"] == str(offset)
    df.loc[is_code, "YEAR"] = first_year + offset





# Keeping only Entity, Year, Records lost, Organization, Method of leak, & Sensitivity columns & deleting others


In [4]:
# Keep only the six columns the analysis needs, selected by name instead of
# dropping twelve positional indices. This keeps working if the CSV's column
# order changes, and a missing column fails loudly with a KeyError.
KEPT_COLUMNS = [
    "Entity",
    "YEAR",
    "records lost",
    "ORGANISATION",
    "METHOD OF LEAK",
    "DATA SENSITIVITY",
]
# .copy() gives df1 its own data, so later assignments to df1 never touch df.
df1 = df[KEPT_COLUMNS].copy()


In [5]:
# Display the remaining column labels to verify the selection above.
df1.columns

Index(['Entity', 'YEAR', 'records lost', 'ORGANISATION', 'METHOD OF LEAK',
       'DATA SENSITIVITY'],
      dtype='object')

# Analyzing Organization, Methods of Leak & Sensitivity columns

In [6]:
# Preview the three columns under analysis; head(10) keeps the rendered output
# short instead of displaying the whole frame.
df1[['ORGANISATION', 'METHOD OF LEAK', 'DATA SENSITIVITY']].head(10)

Unnamed: 0,ORGANISATION,METHOD OF LEAK,DATA SENSITIVITY
0,,,1. Just email address/Online information 20 SS...
1,web,inside job,1
2,financial,poor security,20
3,financial,lost / stolen device,20
4,financial,lost / stolen device,300
5,financial,hacked,300
6,"tech, retail",lost / stolen device,20
7,financial,inside job,300
8,telecoms,hacked,1
9,telecoms,lost / stolen device,1


#  Tracking the errors or null values

In [7]:
# Inspect row 147, one of the rows whose 'records lost' value is missing.
df1.loc[147]

Entity                 OVH
YEAR                  2013
records lost           NaN
ORGANISATION           web
METHOD OF LEAK      hacked
DATA SENSITIVITY        20
Name: 147, dtype: object

In [8]:
# Inspect row 163, the other row whose 'records lost' value is missing.
df1.loc[163]

Entity              UbiSoft
YEAR                   2013
records lost            NaN
ORGANISATION         gaming
METHOD OF LEAK       hacked
DATA SENSITIVITY         20
Name: 163, dtype: object

In [27]:
# The Records Lost column has two rows (147 and 163) with missing (NaN) values; these are filled with 0 in a later cell.

# Formatting the columns names

In [9]:
# Rename the columns through an explicit old -> new mapping rather than a
# positional list, so each new label is tied to the column it describes even
# if the column order changes. 'Entity' already has its final name.
df1 = df1.rename(
    columns={
        "YEAR": "Year",
        "records lost": "Records Lost",
        "ORGANISATION": "Organization",
        "METHOD OF LEAK": "Method of Leak",
        "DATA SENSITIVITY": "Sensitivity",
    }
)

In [10]:
# Display the column labels again to verify the new names.
df1.columns

Index(['Entity', 'Year', 'Records Lost', 'Organization', 'Method of Leak',
       'Sensitivity'],
      dtype='object')

# Changing the null values to zero

In [16]:
# Replace every missing value in df1 with 0 using fillna, the dedicated pandas
# API for this, instead of assigning through a boolean-mask frame.
# NOTE(review): this zero-fills all columns, not only "Records Lost", so any
# missing text entries (e.g. Organization) also become 0 -- confirm that is
# intended before narrowing the fill to the one column.
df1 = df1.fillna(0)


In [17]:
# Re-check row 163: its Records Lost value should now be 0 instead of NaN.
df1.loc[163]

Entity            UbiSoft
Year                 2013
Records Lost            0
Organization       gaming
Method of Leak     hacked
Sensitivity            20
Name: 163, dtype: object

In [20]:
# Re-check row 147: its Records Lost value should now be 0 instead of NaN.
df1.loc[147]

Entity               OVH
Year                2013
Records Lost           0
Organization         web
Method of Leak    hacked
Sensitivity           20
Name: 147, dtype: object

# Saving the contents of the dataframe to a new Excel file

In [21]:
# Write the cleaned frame to an Excel workbook (.xlsx, not CSV); index=False
# leaves the row index out of the file.
df1.to_excel('DatABreachesCleaned.xlsx', index=False)


# Tableau Workbook Visualization 1

In [43]:
%%HTML
<div class='tableauPlaceholder' id='viz1506755090191' style='position: relative'><noscript><a href='#'><img alt=' ' src='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;La&#47;Lab2_100&#47;RecordsLostvsLeakageMethod&#47;1_rss.png' style='border: none' /></a></noscript><object class='tableauViz'  style='display:none;'><param name='host_url' value='https%3A%2F%2Fpublic.tableau.com%2F' /> <param name='embed_code_version' value='2' /> <param name='site_root' value='' /><param name='name' value='Lab2_100&#47;RecordsLostvsLeakageMethod' /><param name='tabs' value='yes' /><param name='toolbar' value='yes' /><param name='static_image' value='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;La&#47;Lab2_100&#47;RecordsLostvsLeakageMethod&#47;1.png' /> <param name='animate_transition' value='yes' /><param name='display_static_image' value='yes' /><param name='display_spinner' value='yes' /><param name='display_overlay' value='yes' /><param name='display_count' value='yes' /></object></div>                <script type='text/javascript'>                    var divElement = document.getElementById('viz1506755090191');                    var vizElement = divElement.getElementsByTagName('object')[0];                    vizElement.style.width='100%';vizElement.style.height=(divElement.offsetWidth*0.75)+'px';                    var scriptElement = document.createElement('script');                    scriptElement.src = 'https://public.tableau.com/javascripts/api/viz_v1.js';                    vizElement.parentNode.insertBefore(scriptElement, vizElement);                </script>

In [28]:
# Tableau visualization 1 above shows the records lost in data breaches, broken down by leakage method.
# The advantage of this visual is that it accurately shows the number of records lost by year and by leakage-method category.
# It also shows that, from 2004 onwards, the category with the highest total number of records lost is Hacking.
# It also shows that from 2015 onwards data breaches have been less severe, as accidentally published records contributed more
# towards the records lost. However, Poor Security is also starting to contribute more towards data breaches from 2015 onwards.
# The next version of this visual could depict this better by using a pie chart that shows how many lost records
# each data breach method accounted for.
# 


# Tableau Workbook Visualization 2

In [45]:
%%HTML
<div class='tableauPlaceholder' id='viz1506755167570' style='position: relative'><noscript><a href='#'><img alt=' ' src='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;La&#47;Lab2_100&#47;PieChartDataBreachMethods&#47;1_rss.png' style='border: none' /></a></noscript><object class='tableauViz'  style='display:none;'><param name='host_url' value='https%3A%2F%2Fpublic.tableau.com%2F' /> <param name='embed_code_version' value='2' /> <param name='site_root' value='' /><param name='name' value='Lab2_100&#47;PieChartDataBreachMethods' /><param name='tabs' value='yes' /><param name='toolbar' value='yes' /><param name='static_image' value='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;La&#47;Lab2_100&#47;PieChartDataBreachMethods&#47;1.png' /> <param name='animate_transition' value='yes' /><param name='display_static_image' value='yes' /><param name='display_spinner' value='yes' /><param name='display_overlay' value='yes' /><param name='display_count' value='yes' /></object></div>                <script type='text/javascript'>                    var divElement = document.getElementById('viz1506755167570');                    var vizElement = divElement.getElementsByTagName('object')[0];                    vizElement.style.width='100%';vizElement.style.height=(divElement.offsetWidth*0.75)+'px';                    var scriptElement = document.createElement('script');                    scriptElement.src = 'https://public.tableau.com/javascripts/api/viz_v1.js';                    vizElement.parentNode.insertBefore(scriptElement, vizElement);                </script>

In [38]:
# Visualization 2 above communicates the idea better and makes it easier to identify the
# dominant methods of data breach incidents. The only disadvantage, I feel, is that it does not show the historical
# trend or pattern over the years. However, the audience can easily see that Hacking was the major method of data breach,
# followed by other methods such as Accidentally published and Poor Security.
# We could improve the next version by incorporating the type of organization and the number of records lost for each type, to identify
# in which business segment or industry data breaches have been more severe.
# This visual is also very important for audiences such as the security team, as they need to invest more in safeguarding
# user information, since hacking has been the major method of data breaches.

# Tableau Workbook Visualization 3

In [46]:
%%HTML
<div class='tableauPlaceholder' id='viz1506755245503' style='position: relative'><noscript><a href='#'><img alt=' ' src='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;QQ&#47;QQK4DBP6Y&#47;1_rss.png' style='border: none' /></a></noscript><object class='tableauViz'  style='display:none;'><param name='host_url' value='https%3A%2F%2Fpublic.tableau.com%2F' /> <param name='embed_code_version' value='2' /> <param name='path' value='shared&#47;QQK4DBP6Y' /> <param name='toolbar' value='yes' /><param name='static_image' value='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;QQ&#47;QQK4DBP6Y&#47;1.png' /> <param name='animate_transition' value='yes' /><param name='display_static_image' value='yes' /><param name='display_spinner' value='yes' /><param name='display_overlay' value='yes' /><param name='display_count' value='yes' /></object></div>                <script type='text/javascript'>                    var divElement = document.getElementById('viz1506755245503');                    var vizElement = divElement.getElementsByTagName('object')[0];                    vizElement.style.width='100%';vizElement.style.height=(divElement.offsetWidth*0.75)+'px';                    var scriptElement = document.createElement('script');                    scriptElement.src = 'https://public.tableau.com/javascripts/api/viz_v1.js';                    vizElement.parentNode.insertBefore(scriptElement, vizElement);                </script>

In [40]:
# Tableau visualization 3 above refutes the claim that "Security breaches do most commonly occur in less-tech-savvy
# organizations", as the organization type Web has the highest number of records lost and the highest number of data breach
# incidents occurred through Web platforms. After Web, Financial organizations have been the hardest hit; however, in recent years,
# from 2015 onwards, Web alone is the major platform for data breaches. The bar chart above is very readable and clearly
# communicates the crucial information regarding data breaches.

# Tableau Workbook Visualization 4

In [47]:
%%HTML
<div class='tableauPlaceholder' id='viz1506755309738' style='position: relative'><noscript><a href='#'><img alt=' ' src='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;KT&#47;KTSDC38B6&#47;1_rss.png' style='border: none' /></a></noscript><object class='tableauViz'  style='display:none;'><param name='host_url' value='https%3A%2F%2Fpublic.tableau.com%2F' /> <param name='embed_code_version' value='2' /> <param name='path' value='shared&#47;KTSDC38B6' /> <param name='toolbar' value='yes' /><param name='static_image' value='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;KT&#47;KTSDC38B6&#47;1.png' /> <param name='animate_transition' value='yes' /><param name='display_static_image' value='yes' /><param name='display_spinner' value='yes' /><param name='display_overlay' value='yes' /><param name='display_count' value='yes' /></object></div>                <script type='text/javascript'>                    var divElement = document.getElementById('viz1506755309738');                    var vizElement = divElement.getElementsByTagName('object')[0];                    vizElement.style.width='100%';vizElement.style.height=(divElement.offsetWidth*0.75)+'px';                    var scriptElement = document.createElement('script');                    scriptElement.src = 'https://public.tableau.com/javascripts/api/viz_v1.js';                    vizElement.parentNode.insertBefore(scriptElement, vizElement);                </script>

In [48]:
# Tableau visualization 4 above is the final visual for the intended audience. It shows that data breaches have become more
# severe, as the records lost are increasing in recent years. In terms of data sensitivity, SSN/personal details of users have
# been increasingly infringed upon in the data breaches. This bar chart is very useful, as the viewer can see the complete details
# of a specific data breach incident on the bars, including entity name, method of leak, organization type, sensitivity, year,
# and the records lost.