In [2]:
import pandas as pd
import plotly.express as px

In [3]:
# Load the cleaned British Airways review data set.
# A forward slash is used as the path separator: the previous backslash formed the
# invalid escape sequence "\B" and only resolved as a path on Windows.
df = pd.read_csv('data/British_Airway_Review_cleaned.csv')

In [4]:
# Print the column dtypes and non-null counts to check for missing data.
df.info()
# no null values: every column reports the full 2500 non-null entries

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   date               2500 non-null   object
 1   country            2500 non-null   object
 2   seat_type          2500 non-null   object
 3   recommended        2500 non-null   object
 4   stars              2500 non-null   int64 
 5   type_of_traveller  2500 non-null   object
 6   ym                 2500 non-null   object
 7   day                2500 non-null   int64 
 8   month              2500 non-null   int64 
 9   year               2500 non-null   int64 
 10  from               2500 non-null   object
 11  to                 2500 non-null   object
 12  verified           2500 non-null   object
 13  contains_prefix    2500 non-null   object
 14  cleaned_reviews    2500 non-null   object
dtypes: int64(4), object(11)
memory usage: 293.1+ KB


In [5]:
# Parse the ISO-formatted (YYYY-MM-DD) date strings, then store them back on the frame
# so the 'date' column holds datetimes instead of plain strings.
parsed_dates = pd.to_datetime(df['date'], format='%Y-%m-%d')
df['date'] = parsed_dates

In [6]:
# colors = {'coolblack' : '#01295C',
#           'persianblue' : '#075AAA',
#           'beaublue' : '#B9CFED',
#           'pigmentred' : '#EB2226',
#           'metallicsilver' : '#A7A9AC',
#           'notwhite' : '#EFE9E5'
#           }

In [7]:
    # Count the reviews for every (year-month, seat type) pair.
    rev_num = (
        df.groupby(['ym', 'seat_type'])
        .size()
        .reset_index(name='count')
    )

    # Bar chart of review volume per year-month, coloured by seat type.
    fig_rev_num = px.bar(
        rev_num,
        x="ym",
        y="count",
        color='seat_type',
        color_discrete_map={
            'Business Class': '#075AAA',
            'Economy Class': '#A7A9AC',
            'First Class': '#B9CFED',
            'Premium Economy': '#01295C',
        },
    )

    # Centre the title, label the axes and pin the legend inside the top-right corner.
    fig_rev_num.update_layout(
        hovermode="x",
        title="Exploring the Volume of Reviews",
        title_x=0.5,
        yaxis_title='Number of Reviews',
        xaxis_title='Date',
        legend_title=None,
        legend={'yanchor': "top", 'y': 0.98, 'xanchor': "right", 'x': 0.99},
        margin={'l': 50, 'r': 40, 't': 50, 'b': 30},
    )

In [8]:
# Distinct star ratings that occur in the data.
df['stars'].unique()

array([5, 3, 1, 9, 7], dtype=int64)

In [9]:
# Number of reviews for each star rating.
df['stars'].value_counts()

stars
3    1137
5     682
1     227
9     227
7     227
Name: count, dtype: int64

In [10]:
# Overlaid violin plots of the star ratings, one per 'recommended' value.
fig = px.violin(
    df,
    y="stars",
    color="recommended",
    violinmode='overlay',
)
fig.show()

In [11]:
# Cross-tabulate star rating against recommendation as a long-format count table.
(
    df.groupby(['stars', 'recommended'])
    .size()
    .reset_index(name='count')
)

Unnamed: 0,stars,recommended,count
0,1,No,157
1,1,Yes,70
2,3,No,734
3,3,Yes,403
4,5,No,435
5,5,Yes,247
6,7,No,150
7,7,Yes,77
8,9,No,144
9,9,Yes,83


In [12]:
# avg_star_ym = round(df.groupby(['ym'])['stars'].mean(),2).reset_index()
# avg_star_ym

# fig_avg_star_ym = px.line(
#     avg_star_ym,
#     x = "ym",
#     y = "stars"
# )
# fig_avg_star_ym.update_traces(mode="markers+lines", hovertemplate=None)
# fig_avg_star_ym.update_layout(hovermode="x",
#                   title = "What is the average rating?",
#                   title_x = 0.5,
#                   yaxis_title = 'Average Number of Stars',
#                   xaxis_title = 'Date'
#                   )

In [13]:
# round(avg_star_ym.stars.mean(),2)

In [50]:
# Count reviews per year-month, split by whether the reviewer recommended the airline.
recc = df.groupby(['ym', 'recommended']).size().reset_index(name='count')

# One line per recommendation value; the map keys match the 'No'/'Yes' labels in the data.
fig_recc = px.line(
    recc,
    x = "ym",
    y = "count",
    color = 'recommended',
    color_discrete_map = {'No': 'red', 'Yes': 'green'}
)
# Title fixed from "Overtime" (extra working hours) to "Over Time" (a trend across dates).
fig_recc.update_layout(hovermode="x",
                    title = "Recommendation Over Time",
                    title_x = 0.5,
                    yaxis_title = 'Number of Reviews',
                    xaxis_title = 'Date',
                    legend_title = None,
                    legend = dict(yanchor = "top", y = 0.98,
                              xanchor = "right", x = 0.99),
                    margin = dict(l = 50, r = 40, t = 50, b = 30)
                    )

# Leave the figure as the last expression so the notebook renders it.
fig_recc

In [15]:
# Pie chart of the share of reviews per seat type, using a fixed colour per class.
fig_seattype = px.pie(
    df,
    names="seat_type",
    color="seat_type",
    color_discrete_map={
        'Business Class': '#075AAA',
        'Economy Class': '#A7A9AC',
        'First Class': '#B9CFED',
        'Premium Economy': '#01295C',
    },
)
fig_seattype.update_layout(
    title="By Seat Type",
    title_x=0.5,
    margin={'l': 50, 'r': 40, 't': 50, 'b': 30},
)

In [49]:
# Sunburst of reviews whose path runs from traveller type to recommendation.
fig_travellertype = px.sunburst(df, path = ['type_of_traveller','recommended'],
                                color = 'recommended',
                                # Keys must match the capitalised 'Yes'/'No' labels stored in
                                # the 'recommended' column; the lowercase keys used previously
                                # never matched, so the intended colours were not applied.
                                color_discrete_map = {'Yes': 'green', 'No': 'red'}
                                )
# Title describes this chart (traveller type) rather than the travel-route chart.
fig_travellertype.update_layout(title = "By Traveller Type",
                            title_x = 0.5,
                            margin = dict(l = 50, r = 40, t = 50, b = 30)
)
# Show each sector's label together with its share of the parent sector.
fig_travellertype.update_traces(textinfo="label+percent parent")

In [17]:
# Number of reviews per country.
df_country = (
    df.groupby(['country'])
    .size()
    .reset_index(name='count')
)
# The ten largest counts; the resulting Series keeps df_country's row labels.
df_c_index = df_country['count'].nlargest(10)

In [18]:
# Boolean mask marking the rows of df_country whose label is among the ten largest counts.
in_top10 = df_country.index.isin(df_c_index.index)

# Split the per-country table into the top 10 and everything else.
top10_counts = df_country[in_top10]
not_top10_counts = df_country[~in_top10]

In [19]:
# How many distinct countries fall outside the top 10.
not_top10_counts['country'].nunique()

57

In [20]:
# Total number of reviews written by reviewers from countries outside the top 10.
not_top10_counts['count'].sum()

315

In [21]:
# Build a one-sentence summary of the countries outside the top 10:
# how many there are and how many reviews they account for.
other_country_total = len(not_top10_counts.country.unique())
other_review_total = not_top10_counts['count'].sum()
"there is a total of {} nationalities which are not in the top 10 nationalities and amounted to a count of {}".format(other_country_total, other_review_total)

'there is a total of 57 nationalities which are not in the top 10 nationalities and amounted to a count of 315'

In [22]:
# Pie chart of the review counts for the ten countries with the most reviews.
px.pie(
    top10_counts,
    names='country',
    values='count',
    color_discrete_sequence=px.colors.sequential.RdBu,
)

In [48]:
# Bar chart of review counts for the top-10 countries, one colour per country,
# with the count printed on each bar.
fig_country = px.bar(
    top10_counts,
    x='country',
    y='count',
    color='country',
    color_discrete_sequence=px.colors.sequential.ice,
    text_auto=True,
)
fig_country.update_layout(
    hovermode="x",
    title="By Top 10 Countries",
    title_x=0.5,
    yaxis_title='Number of Reviews',
    xaxis_title=None,
    # Order the bars from the largest to the smallest total.
    xaxis={'categoryorder': 'total descending'},
    showlegend=False,
    margin={'l': 50, 'r': 40, 't': 50, 'b': 30},
)
# Place the value labels above the bars rather than inside them.
fig_country.update_traces(textposition="outside")

In [None]:
# Sunburst of travel routes whose path runs from the 'from' column to the 'to' column.
fig_route = px.sunburst(
    df,
    path=['from', 'to'],
    color_discrete_sequence=px.colors.sequential.RdBu,
)
fig_route.update_layout(
    title="By Travel Routes",
    title_x=0.5,
    margin={'l': 50, 'r': 40, 't': 50, 'b': 30},
)
# Show each sector's label together with its share of the parent sector.
fig_route.update_traces(textinfo="label+percent parent")


In [None]:
# df[(df['stars'] == 1) & (df['recommended'] == 'Yes')].cleaned_reviews.head(10)