In [1]:
import mysql.connector
import pandas as pd

In [2]:
def read_data(): # connects to mysql db and reads the data
    cnx = mysql.connector.connect(user='root', 
                                  host='localhost', auth_plugin='mysql_native_password', database='big_data')

    mycursor = cnx.cursor()
    mycursor.execute("SELECT * FROM article_summary")

    records = mycursor.fetchall()  # fetches the records from the cursor as a list
    df = pd.DataFrame(records, columns=['index','index2', 'Snapshot', 'Article', 'Article Vector Centroid', 'Article Topics Distributions',
                                        'TextStat Fleisch Reading Difficulty', 'Eigenvector Centrality', 'Louvain Community',
                                        'Clicks in month', 'Article Length', 'Target Complexity'])
    df = df.drop(columns=['index','index2'])
    cnx.close()
    return df


In [3]:
# df = read_data()
df = pd.read_csv("article_summary-202001.csv")

In [4]:
year = '2020'
month = '01'
date = int(year+month)
df = df.loc[df['Snapshot'] == date] # filter data based on monthly date

In [5]:
df['Article Topics Distribution'] = df['Article Topics Distributions'].apply(lambda x:x[1:-1].split(','))
df = df.drop(['Article Topics Distributions'],axis=1)
new_data = pd.DataFrame(df['Article Topics Distribution'].values.tolist(), index= df.index).add_prefix('Article Topics Distribution_')
df = pd.concat([df, new_data[:]], axis=1)

In [6]:
df = df[df['Article Topics Distribution'].apply(len).gt(2)]
filter_col = [col for col in df if col.startswith('Article Topics Distribution')]
filter_col.remove('Article Topics Distribution')
filter_col.insert(0,'Article')
df = df[filter_col]
for col in list(df.columns):
    if ('Article Topics Distribution_' in col):
        df[col] = df[col].astype(float)

df.rename(columns = {'Article Topics Distribution_0':'T1', 'Article Topics Distribution_1':'T2', 'Article Topics Distribution_2':'T3',
                    'Article Topics Distribution_3': 'T4', 'Article Topics Distribution_4': 'T5'}, inplace = True)

df.fillna(0, inplace=True) #replace nan values with 0
df.loc[:,'Total'] = df.sum(axis=1)
df.loc['Total', 1:] = df.sum(axis=0)
df.fillna('', inplace=True)
df_cols = df.columns[1:-1]

In [7]:
# Article count
new_df = df[df_cols]
new_df['Topics'] = new_df.idxmax(axis=1)
new_df = new_df.groupby('Topics')['Topics'].size()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [8]:
#calculate the proportion
proportion = df.iloc[-1, 1:-1].divide(df.iloc[-1,-1], axis = 'rows').rename("Proportion")

In [9]:
proportion = proportion.values.tolist()
article_count = new_df.values.tolist()
Topics = df_cols.tolist()
result = {'Topics':Topics, 'Article_count':article_count, 'Proportion':proportion}
result

{'Topics': ['T1', 'T2', 'T3', 'T4', 'T5'],
 'Article_count': [4569, 2918, 634, 2281, 4715],
 'Proportion': [0.2907672541792734,
  0.191825218459174,
  0.060465153668220396,
  0.15187901049005287,
  0.30506336320327937]}

In [16]:
# import matplotlib.pyplot as plt
# fig, ax = plt.subplots()
# ax.axis('equal')
# mypie, _ = ax.pie(article_count,radius=2.4, labels=proportion)
# plt.setp(mypie, width=0.8, edgecolor='white')

# # Second Ring (Inside)
# mypie2, _ = ax.pie(article_count, radius=2.4-0.6, labels=article_count, labeldistance=0.8)
# plt.setp(mypie2, width=0.6, edgecolor='white')
# plt.margins(0,0)

# # Second Ring (Inside)
# mypie3, _ = ax.pie(article_count, radius=2.4-1.2, labels=Topics, labeldistance=0.7)
# plt.setp(mypie3, width=0.6, edgecolor='white')
# plt.margins(0,0)


# plt.show()

In [17]:
import plotly.express as px

fig = px.sunburst(result, path=['Topics', 'Article_count'], values='Proportion',
                  color='Proportion', hover_data=['Article_count'],
                  color_continuous_scale='Peach', width=700, height=400)

fig.update_layout(hoverlabel_font_color='rgb(0,0,0)', title_text='Wikipedia Page Summary')
fig.show()

In [12]:
import plotly.graph_objects as go

# rgb(255, 127, 80)
# 'rgb(158,202,225)'
# marker_line_color='rgb(8,48,107)'
fig = go.Figure(data=[go.Bar(x=Topics, y=article_count)])

fig.update_traces(marker_color='lightsalmon', marker_line_color='indianred', width =0.5,
                  marker_line_width=1.5, opacity=0.6)
fig.update_layout(xaxis_title= 'Topics', yaxis_title="Number of Topics" , title_text='Wikipedia Page Summary')
fig.show()