<a href="https://colab.research.google.com/github/vaikunthd/Opioid-Prescription-Analysis-using-Machine-Learning/blob/main/Data_Exploration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Dataset**

The dataset was downloaded from Kaggle [2] and originates from cms.gov. It consists of three files. The prescriber-info.csv file lists the number of opiate and non-opiate drugs prescribed in 2014 by 25,000 unique medical professionals in the United States to Medicare beneficiaries under Medicare Part D. It also contains information about the professionals themselves, such as their NPI number, gender, state, credentials and specialty. A doctor is labelled as an opiate prescriber if they prescribed opiate drugs more than 10 times, based on the claim counts of patients in Medicare Part D.
The overdoses.csv file contains the population and death count for each state in the USA, and the opioids.csv file contains the list of drugs that fall under the opiate category.

Dataset Link: https://www.kaggle.com/datasets/apryor6/us-opiate-prescriptions


# **Data Exploration**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
!pip install chart_studio
import chart_studio.plotly as py
import plotly.graph_objs as go
import plotly.tools as tls
%matplotlib inline

Collecting chart_studio
  Downloading chart_studio-1.1.0-py3-none-any.whl (64 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/64.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.4/64.4 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting retrying>=1.3.3 (from chart_studio)
  Downloading retrying-1.3.4-py3-none-any.whl (11 kB)
Installing collected packages: retrying, chart_studio
Successfully installed chart_studio-1.1.0 retrying-1.3.4


In [None]:
import os
from getpass import getpass

import chart_studio
import chart_studio.tools

# SECURITY: the API key must never be stored in the notebook itself. A key that
# has been committed to version control should be treated as leaked and
# regenerated in the Chart Studio account settings.
# The credentials are read from the environment; when the API key is not set,
# the user is prompted for it without echoing the value into the cell output.
chart_studio_username = os.environ.get('CHART_STUDIO_USERNAME', 'VD05')
chart_studio_api_key = os.environ.get('CHART_STUDIO_API_KEY') or getpass('Chart Studio API key: ')

# set the credentials
chart_studio.tools.set_credentials_file(username=chart_studio_username, api_key=chart_studio_api_key)

In [None]:
# Load the two CSV files used below: the per-prescriber table and the
# overdoses table.
df = pd.read_csv(filepath_or_buffer="/content/sample_data/prescriber-info.csv")
overdose_df = pd.read_csv(filepath_or_buffer="/content/sample_data/overdoses.csv")

# Preview the first rows of the prescriber table.
df.head()

Unnamed: 0,NPI,Gender,State,Credentials,Specialty,ABILIFY,ACETAMINOPHEN.CODEINE,ACYCLOVIR,ADVAIR.DISKUS,AGGRENOX,...,VERAPAMIL.ER,VESICARE,VOLTAREN,VYTORIN,WARFARIN.SODIUM,XARELTO,ZETIA,ZIPRASIDONE.HCL,ZOLPIDEM.TARTRATE,Opioid.Prescriber
0,1710982582,M,TX,DDS,Dentist,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1245278100,F,AL,MD,General Surgery,0,0,0,0,0,...,0,0,0,0,0,0,0,0,35,1
2,1427182161,F,NY,M.D.,General Practice,0,0,0,0,0,...,0,0,0,0,0,0,0,0,25,0
3,1669567541,M,AZ,MD,Internal Medicine,0,43,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,1679650949,M,NV,M.D.,Hematology/Oncology,0,0,0,0,0,...,0,0,0,0,17,28,0,0,0,1


# **Functions used**

In [None]:
def bar_chart_trace(x, y, color, name, orientation = 'v'):
    """
    Build one plotly bar trace from the given data and styling options.

    Arguments:
    1. x: The values placed on the x-axis.
    2. y: The values placed on the y-axis.
    3. color: The marker color (a single color or one color per bar).
    4. name: The name of the trace.
    5. orientation: The orientation of the bars, 'v' by default.
    Returns:
    1. The go.Bar trace built from the arguments.
    """
    # Collect the keyword arguments first so the go.Bar call stays on one line.
    bar_options = {
        'x': x,
        'y': y,
        'marker': {'color': color},
        'name': name,
        'orientation': orientation,
    }
    return go.Bar(**bar_options)

# **Gender and states of prescriber**

In [None]:
# Readable axis labels for the gender codes stored in the 'Gender' column.
# Codes missing from this mapping are shown unchanged.
GENDER_LABELS = {'M': 'Male', 'F': 'Female'}

# Number of prescribers per gender, most frequent first.
gender_count = df['Gender'].value_counts()
# Percentage share of every gender, formatted for display inside the bars.
text = ['{:.2f}%'.format(share) for share in gender_count * 100 / gender_count.sum()]

# One bar per gender.
trace = bar_chart_trace(x = gender_count.index, y = gender_count.values, name = 'Gender Population',
                        color = ['rgba(24, 57, 126, 1)', 'rgba(88, 167, 26, 1)'])
data = [trace]

# Layout of the plot. The tick labels are derived from the order in which
# value_counts() returned the categories, so a label can never end up under
# the wrong bar. Axis titles use title.text / title.font because the older
# 'titlefont' attribute is deprecated.
layout = go.Layout(
    title='Overall gender distribution',
    xaxis=dict(
        title = dict(
            text = 'Gender',
            font = dict(size = 14, color = 'rgba(0, 0, 0, 1)')
        ),
        tickmode = 'array',
        ticktext = [GENDER_LABELS.get(code, code) for code in gender_count.index],
        tickvals = list(range(len(gender_count))),
        tickfont=dict(
            size=14,
            color='rgba(0, 0, 0, 1)'
        )
    ),
    yaxis=dict(
        title=dict(
            text='Frequency',
            font=dict(
                size=14,
                color='rgba(0, 0, 0, 1)'
            )
        ),
        tickfont=dict(
            size=12,
            color='rgba(0, 0, 0, 1)'
        )
    ),
    plot_bgcolor = 'rgba(150, 150, 150, 0.03)'
)

# One percentage annotation per bar, centred vertically (half the bar height).
# Iterating over the actual categories avoids assuming there are exactly two.
annotations = [
    dict(x = code, y = count / 2, text = label,
         font = dict(size = 25, color = 'rgba(255,255,255,1)'), showarrow = False)
    for (code, count), label in zip(gender_count.items(), text)
]
layout['annotations'] = annotations

fig = go.Figure(data=data, layout = layout)
py.iplot(fig, filename='gender_count_bar')

In [None]:
# Start from an empty frame indexed by the alphabetically sorted state codes;
# the per-state columns are added in the next cell.
state_codes = sorted(df['State'].unique())
gender = pd.DataFrame(index = state_codes)

In [None]:
# Male and female prescriber counts per state. value_counts() omits states
# that have no prescriber of the given gender, so the result is reindexed on
# the full state list with 0 instead of leaving NaN in the table.
gender['Male'] = df[df['Gender'] == 'M']['State'].value_counts().reindex(gender.index, fill_value = 0)
gender['Female'] = df[df['Gender'] == 'F']['State'].value_counts().reindex(gender.index, fill_value = 0)
# Total number of prescribers per state.
gender['Total'] = df['State'].value_counts()

# Share of each gender among the prescribers of a state, in percent.
gender['Male Percentage'] = gender['Male']*100/gender['Total']
gender['Female Percentage'] = gender['Female']*100/gender['Total']

# Female-to-male ratio. It is undefined (NaN) for states without any male
# prescriber, which avoids a division by zero.
gender['Female/Male'] = gender['Female']/gender['Male'].where(gender['Male'] > 0)

# Sort by the number of prescribers, smallest first. Re-assigning instead of
# sorting in place keeps the cell safe to re-run.
gender = gender.sort_values(by = 'Total', ascending = True)

# The 10 states/territories with the fewest prescribers.
gender.head(n = 10)

Unnamed: 0,Male,Female,Total,Male Percentage,Female Percentage,Female/Male
AA,1.0,,1,100.0,,
GU,2.0,,2,100.0,,
ZZ,2.0,,2,100.0,,
AE,1.0,1.0,2,50.0,50.0,1.0
VI,,3.0,3,,100.0,
WY,31.0,7.0,38,81.578947,18.421053,0.225806
AK,23.0,16.0,39,58.974359,41.025641,0.695652
VT,38.0,27.0,65,58.461538,41.538462,0.710526
ND,41.0,25.0,66,62.121212,37.878788,0.609756
MT,50.0,27.0,77,64.935065,35.064935,0.54


In [None]:
# One trace per gender; both share the state codes on the x-axis, which are
# ordered by the total number of prescribers (see the sort of `gender`).
trace1 = bar_chart_trace(x = gender.index, y = gender['Male'].values, name = 'Male',
                        color = 'rgba(24, 57, 126, 1)', orientation = 'v')
trace2 = bar_chart_trace(x = gender.index, y = gender['Female'].values, name = 'Female',
                        color = 'rgba(88, 167, 26, 1)', orientation = 'v')
data = [trace1, trace2]

# Grouped bar layout. Axis titles use title.text / title.font because the
# older 'titlefont' attribute is deprecated.
layout = go.Layout(
    title='Male and Female prescribers across various states',
    xaxis=dict(
        title = dict(
            text = 'States and territories',
            font = dict(size = 14, color = 'rgba(107, 107, 107, 1)')
        ),
        # Rotate the state codes so that all of them fit on the axis.
        tickangle = -60,
        tickfont=dict(
            size=10,
            color='rgba(0, 0, 0, 1)'
        )
    ),
    yaxis=dict(
        title=dict(
            text='Frequency',
            font=dict(
                size=14,
                color='rgba(0, 0, 0, 1)'
            )
        ),
        tickfont=dict(
            size=12,
            color='rgba(0, 0, 0, 1)'
        )
    ),
    # Legend in the top-left corner with a fully transparent background.
    legend=dict(
        x = 0.0,
        y = 1.0,
        bgcolor='rgba(255, 255, 255, 0)',
        bordercolor='rgba(255, 255, 255, 0)'
    ),
    barmode='group',
    bargap=0.25,
    bargroupgap=0.15,
    plot_bgcolor = 'rgba(150, 150, 150, 0.03)'
)

fig = go.Figure(data=data, layout = layout)
py.iplot(fig, filename='gender_bar')

# **Class distribution**

In [None]:
# Number of prescribers per class of the 'Opioid.Prescriber' label
# (1 = opioid prescriber, 0 = non-prescriber), most frequent first.
class_count = df['Opioid.Prescriber'].value_counts()
# Percentage share of every class, formatted for display inside the bars.
text = ['{:.2f}%'.format(share) for share in class_count * 100 / class_count.sum()]

# One bar per class.
trace = bar_chart_trace(x = class_count.index, y = class_count.values, name = 'Class distribution',
                        color = ['rgba(24, 57, 126, 1)', 'rgba(88, 167, 26, 1)'])
data = [trace]

# Layout of the plot. The numeric class values 0 and 1 get readable tick
# labels. Axis titles use title.text / title.font because the older
# 'titlefont' attribute is deprecated.
layout = go.Layout(
    title='Class Distribution',
    xaxis=dict(
        title = dict(
            text = 'Opioid prescriber',
            font = dict(size = 14, color = 'rgba(0, 0, 0, 1)')
        ),
        tickmode = 'array',
        ticktext = ['Non-Prescriber', 'Prescriber'],
        tickvals = [0, 1],
        tickfont=dict(
            size=14,
            color='rgba(0, 0, 0, 1)'
        )
    ),
    yaxis=dict(
        title=dict(
            text='Frequency',
            font=dict(
                size=14,
                color='rgba(0, 0, 0, 1)'
            )
        ),
        tickfont=dict(
            size=12,
            color='rgba(0, 0, 0, 1)'
        )
    ),
    plot_bgcolor = 'rgba(150, 150, 150, 0.03)'
)

# One percentage annotation per bar, centred vertically (half the bar height).
# Iterating over the actual classes avoids assuming there are exactly two.
annotations = [
    dict(x = label_value, y = count / 2, text = label,
         font = dict(size = 25, color = 'rgba(255,255,255,1)'), showarrow = False)
    for (label_value, count), label in zip(class_count.items(), text)
]
layout['annotations'] = annotations

fig = go.Figure(data=data, layout = layout)
py.iplot(fig, filename='class_count_bar')

In [None]:
# Empty frame indexed by the alphabetically sorted state codes.
class_dist = pd.DataFrame(index = sorted(df['State'].unique()))

# Opioid prescriber ('1') and non-prescriber ('0') counts per state.
# value_counts() omits states that have no prescriber of the given class, so
# the result is reindexed on the full state list with 0 instead of NaN.
class_dist['1'] = df[df['Opioid.Prescriber'] == 1]['State'].value_counts().reindex(class_dist.index, fill_value = 0)
class_dist['0'] = df[df['Opioid.Prescriber'] == 0]['State'].value_counts().reindex(class_dist.index, fill_value = 0)
# Total number of prescribers per state.
class_dist['Total'] = df['State'].value_counts()

# Share of each class among the prescribers of a state, in percent.
class_dist['1 Percentage'] = class_dist['1']*100/class_dist['Total']
class_dist['0 Percentage'] = class_dist['0']*100/class_dist['Total']

# Ratio of non-prescribers to opioid prescribers. It is undefined (NaN) for
# states without any opioid prescriber, which avoids a division by zero.
class_dist['0/1'] = class_dist['0']/class_dist['1'].where(class_dist['1'] > 0)

# Sort by the number of prescribers, smallest first.
class_dist = class_dist.sort_values(by = 'Total', ascending = True)

# The 10 states/territories with the fewest prescribers.
class_dist.head(n = 10)

Unnamed: 0,1,0,Total,1 Percentage,0 Percentage,0/1
AA,,1.0,1,,100.0,
GU,1.0,1.0,2,50.0,50.0,1.0
ZZ,1.0,1.0,2,50.0,50.0,1.0
AE,2.0,,2,100.0,,
VI,3.0,,3,100.0,,
WY,24.0,14.0,38,63.157895,36.842105,0.583333
AK,27.0,12.0,39,69.230769,30.769231,0.444444
VT,40.0,25.0,65,61.538462,38.461538,0.625
ND,39.0,27.0,66,59.090909,40.909091,0.692308
MT,55.0,22.0,77,71.428571,28.571429,0.4


In [None]:
# One trace per class; both share the state codes on the x-axis, which are
# ordered by the total number of prescribers (see the sort of `class_dist`).
trace1 = bar_chart_trace(x = class_dist.index, y = class_dist['1'].values, name = 'Prescriber',
                        color = 'rgba(24, 57, 126, 1)', orientation = 'v')
trace2 = bar_chart_trace(x = class_dist.index, y = class_dist['0'].values, name = 'Non-Prescriber',
                        color = 'rgba(88, 167, 26, 1)', orientation = 'v')
data = [trace1, trace2]

# Grouped bar layout. Axis titles use title.text / title.font because the
# older 'titlefont' attribute is deprecated.
layout = go.Layout(
    title='Opioid and non-opioid prescribers across all states',
    xaxis=dict(
        title = dict(
            text = 'States and territories',
            font = dict(size = 14, color = 'rgba(107, 107, 107, 1)')
        ),
        # Rotate the state codes so that all of them fit on the axis.
        tickangle = -60,
        tickfont=dict(
            size=10,
            color='rgba(0, 0, 0, 1)'
        )
    ),
    yaxis=dict(
        title=dict(
            text='Frequency',
            font=dict(
                size=14,
                color='rgba(0, 0, 0, 1)'
            )
        ),
        tickfont=dict(
            size=12,
            color='rgba(0, 0, 0, 1)'
        )
    ),
    # Legend in the top-left corner with a fully transparent background.
    legend=dict(
        x = 0.0,
        y = 1.0,
        bgcolor='rgba(255, 255, 255, 0)',
        bordercolor='rgba(255, 255, 255, 0)'
    ),
    barmode='group',
    bargap=0.25,
    bargroupgap=0.15,
    plot_bgcolor = 'rgba(150, 150, 150, 0.03)'
)

fig = go.Figure(data=data, layout = layout)
py.iplot(fig, filename='class_bar_states')