# Imports

In [1]:
import chart_studio.plotly as py
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
import pandas as pd
import numpy as np

%load_ext autoreload
%autoreload 2

# Plotting Veracity Value Counts

In [15]:
# Load the cleaned PolitiFact statements and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='politifact_clean.csv')
df.head(n=5)

Unnamed: 0,statement,source,link,veracity
0,"Sen. Kamala Harris is ""supporting the animals ...",Donald Trump,/web/20180705082623/https://www.politifact.com...,Pants on Fire!
1,"Says Ronald Reagan said immigrants ""brought wi...",Becoming American Initiative,/web/20180705082623/https://www.politifact.com...,Mostly True
2,"Says Democratic Senators ""demand Supreme Court...",Viral image,/web/20180705082623/https://www.politifact.com...,Pants on Fire!
3,"""Tim Kaine doesn’t want a border at all. He wa...",Corey Stewart,/web/20180705082623/https://www.politifact.com...,Pants on Fire!
4,"""George H.W. Bush has died at 94.""",Bloggers,/web/20180705082623/https://www.politifact.com...,Pants on Fire!


In [17]:
# Share of statements per veracity label (fractions rather than raw counts).
df['veracity'].value_counts(normalize=True)

Mostly True       0.240883
False             0.236056
Mostly False      0.208706
True              0.192885
Pants on Fire!    0.121469
Name: veracity, dtype: float64

In [21]:
# Pie chart of the veracity classes in the cleaned dataset.
#
# Labels are taken from the index of the very Series that supplies the values.
# value_counts() orders classes by frequency, so a hand-written label list
# mislabels slices whenever its order differs from that frequency order.
veracity_counts = df['veracity'].value_counts()

fig = go.Figure()
fig.add_trace(go.Pie(
    labels=veracity_counts.index.tolist(),
    values=veracity_counts.tolist(),
    textinfo='label+percent',
))
fig.update_layout(
    title={
        'text': 'Veracity Distribution in Dataset',
        'y': 0.92,
        'x': 0.45,
        'yanchor': 'top',
        'xanchor': 'center',
    },
    title_font_size=20,
)

fig.show()

# Pie-chart veracity clean binarized

In [41]:
# Load the binarized variant of the cleaned dataset and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='politifact_clean_binarized.csv')
df.head(n=5)

Unnamed: 0,statement,source,link,veracity
0,"Sen. Kamala Harris is ""supporting the animals ...",Donald Trump,/web/20180705082623/https://www.politifact.com...,0
1,"Says Ronald Reagan said immigrants ""brought wi...",Becoming American Initiative,/web/20180705082623/https://www.politifact.com...,1
2,"Says Democratic Senators ""demand Supreme Court...",Viral image,/web/20180705082623/https://www.politifact.com...,0
3,"""Tim Kaine doesn’t want a border at all. He wa...",Corey Stewart,/web/20180705082623/https://www.politifact.com...,0
4,"""George H.W. Bush has died at 94.""",Bloggers,/web/20180705082623/https://www.politifact.com...,0


In [43]:
# Pie chart of lies vs. truths in the clean binarized dataset.
#
# veracity is encoded as 0 = lie, 1 = truth (the same encoding the histogram
# cells use to split `truths` from `lies`). value_counts() orders the classes
# by frequency, so names and colours are looked up from the class code instead
# of assuming that lies are always the larger class.
veracity_counts = df['veracity'].value_counts()
slice_names = {0: 'Lies', 1: 'Truth'}
slice_colors = {0: '#FFA15A', 1: '#636EFA'}

fig = go.Figure()
fig.add_trace(go.Pie(
    labels=[slice_names[code] for code in veracity_counts.index],
    values=veracity_counts.tolist(),
    textinfo='label+percent',
    marker_colors=[slice_colors[code] for code in veracity_counts.index],
))
fig.update_layout(
    title={
        'text': 'Clean Binarized Dataset Veracity Distribution',
        'y': 0.92,
        'x': 0.5,
        'yanchor': 'top',
        'xanchor': 'center',
    },
    title_font_size=20,
)

fig.show()

# Pie-chart strict binarized

In [44]:
# Load the strict binarized dataset and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='politifact_strict_binarized.csv')
df.head(n=5)

Unnamed: 0,statement,source,link,veracity
0,"Sen. Kamala Harris is ""supporting the animals ...",Donald Trump,/web/20180705082623/https://www.politifact.com...,0
1,"Says Democratic Senators ""demand Supreme Court...",Viral image,/web/20180705082623/https://www.politifact.com...,0
2,"""Tim Kaine doesn’t want a border at all. He wa...",Corey Stewart,/web/20180705082623/https://www.politifact.com...,0
3,"""George H.W. Bush has died at 94.""",Bloggers,/web/20180705082623/https://www.politifact.com...,0
4,"""The deficit ... is coming down, and it’s comi...",Larry Kudlow,/web/20180705082623/https://www.politifact.com...,0


In [45]:
# Pie chart of lies vs. truths in the strict binarized dataset.
#
# veracity is encoded as 0 = lie, 1 = truth (the same encoding the histogram
# cells use to split `truths` from `lies`). value_counts() orders the classes
# by frequency, so names and colours are looked up from the class code instead
# of assuming that lies are always the larger class.
veracity_counts = df['veracity'].value_counts()
slice_names = {0: 'Lies', 1: 'Truth'}
slice_colors = {0: '#FFA15A', 1: '#636EFA'}

fig = go.Figure()
fig.add_trace(go.Pie(
    labels=[slice_names[code] for code in veracity_counts.index],
    values=veracity_counts.tolist(),
    textinfo='label+percent',
    marker_colors=[slice_colors[code] for code in veracity_counts.index],
))
fig.update_layout(
    title={
        'text': 'Strict Binarized Dataset Veracity Distribution',
        'y': 0.92,
        'x': 0.5,
        'yanchor': 'top',
        'xanchor': 'center',
    },
    title_font_size=20,
)

fig.show()

# Plotting General Statistics

### Reading in data

In [46]:
# Load the per-statement feature table (word, POS and readability columns)
# and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='politifact_ner_automatic_pos_tag_counts.csv')
df.head(n=5)

Unnamed: 0,source,veracity,date,lemmas,numb_of_letters,numb_of_words,avg_word_len,read_score_gun,pos_tags,count_of_adjective,...,count_of_punctuation,freq_of_punctuation,count_of_space,freq_of_space,count_of_symbol,freq_of_symbol,count_of_verb,freq_of_verb,count_of_unknown,freq_of_unknown
0,Donald Trump,0,2018-07-03,kamala harris support animal,56,9,5.3,10.0,"[('Sen.', 'PROPN'), ('Kamala', 'PROPN'), ('Har...",0,...,3,0.333333,0,0.0,0,0.0,1,0.111111,0,0.0
1,Becoming American Initiative,1,2018-07-03,ronald reagan immigrant bring courage value fa...,173,30,4.8,7.3,"[('Says', 'VERB'), ('Ronald', 'PROPN'), ('Reag...",2,...,6,0.2,1,0.033333,0,0.0,6,0.2,0,0.0
2,Viral image,0,2018-07-03,democratic senators demand supreme court nomin...,102,14,6.4,16.7,"[('Says', 'VERB'), ('Democratic', 'ADJ'), ('Se...",1,...,3,0.214286,0,0.0,0,0.0,3,0.214286,0,0.0
3,Corey Stewart,0,2018-07-03,tim kaine want border want rid immigration cus...,141,25,4.7,9.8,"[('""', 'PUNCT'), ('Tim', 'PROPN'), ('Kaine', '...",0,...,4,0.16,0,0.0,0,0.0,3,0.12,0,0.0
4,Bloggers,0,2018-07-02,george bush die,34,7,4.0,0.9,"[('""', 'PUNCT'), ('George', 'PROPN'), ('H.W.',...",0,...,3,0.428571,0,0.0,0,0.0,1,0.142857,0,0.0


In [47]:
# Pie chart of lies vs. truths in the feature table.
#
# veracity is encoded as 0 = lie, 1 = truth (the same encoding the histogram
# cells use to split `truths` from `lies`). value_counts() orders the classes
# by frequency, so names and colours are looked up from the class code instead
# of assuming that lies are always the larger class.
veracity_counts = df['veracity'].value_counts()
slice_names = {0: 'Lies', 1: 'Truth'}
slice_colors = {0: '#FFA15A', 1: '#636EFA'}

fig = go.Figure()
fig.add_trace(go.Pie(
    labels=[slice_names[code] for code in veracity_counts.index],
    values=veracity_counts.tolist(),
    textinfo='label+percent',
    marker_colors=[slice_colors[code] for code in veracity_counts.index],
))
fig.update_layout(
    title={
        'text': 'Statements Veracity Distribution in Dataset',
        'y': 0.92,
        'x': 0.5,
        'yanchor': 'top',
        'xanchor': 'center',
    },
    title_font_size=20,
)

fig.show()

## Histogram Number Words

https://plotly.com/python/histograms/

In [48]:
# Partition the statements by label: veracity 1 marks a truth, 0 marks a lie.
truths = df.loc[df['veracity'].eq(1)]
lies = df.loc[df['veracity'].eq(0)]

In [49]:
# Overlaid histograms of the number of words per statement, lies vs. truths.
# Both traces are binned over the same range.
word_bins = dict(start=-0.1, end=50.0)

fig = go.Figure(data=[
    go.Histogram(
        x=lies['numb_of_words'],
        name='Lies',
        marker_color='#FFA15A',
        xbins=word_bins,
    ),
    # Drawn second and slightly transparent so the lies trace shows through.
    go.Histogram(
        x=truths['numb_of_words'],
        name='Truths',
        marker_color='#636EFA',
        opacity=0.8,
        xbins=word_bins,
    ),
])

fig.update_layout(
    barmode='overlay',
    bargap=0.05,
    xaxis_title="Number of Words",
    yaxis_title="Count",
    title={'y': 0.9, 'x': 0.5, 'yanchor': 'top', 'xanchor': 'center'},
    title_text='Distribution of Number of Words in Truths vs. Lies',
    title_font_size=20,
)

fig.show()

## Histogram Number of Adjectives

In [50]:
# Overlaid histograms of the adjective count per statement, lies vs. truths.
# Both traces are binned over the same range.
adjective_bins = dict(start=-0.1, end=5.0)

fig = go.Figure(data=[
    go.Histogram(
        x=lies['count_of_adjective'],
        name='Lies',
        marker_color='#FFA15A',
        xbins=adjective_bins,
    ),
    # Drawn second and slightly transparent so the lies trace shows through.
    go.Histogram(
        x=truths['count_of_adjective'],
        name='Truths',
        marker_color='#636EFA',
        opacity=0.8,
        xbins=adjective_bins,
    ),
])

fig.update_layout(
    barmode='overlay',
    bargap=0.05,
    xaxis_title="Number of Adjectives",
    yaxis_title="Count",
    title={'y': 0.9, 'x': 0.5, 'yanchor': 'top', 'xanchor': 'center'},
    title_text='Distribution of Adjective Count in Truths vs. Lies',
    title_font_size=20,
)

fig.show()

## Histogram Number of Nouns

In [51]:
# Overlaid histograms of the noun count per statement, lies vs. truths.
# Both traces are binned over the same range.
noun_bins = dict(start=-0.1, end=15.0)

fig = go.Figure(data=[
    go.Histogram(
        x=lies['count_of_noun'],
        name='Lies',
        marker_color='#FFA15A',
        xbins=noun_bins,
    ),
    # Drawn second and slightly transparent so the lies trace shows through.
    go.Histogram(
        x=truths['count_of_noun'],
        name='Truths',
        marker_color='#636EFA',
        opacity=0.8,
        xbins=noun_bins,
    ),
])

fig.update_layout(
    barmode='overlay',
    bargap=0.05,
    xaxis_title="Number of Nouns",
    yaxis_title="Count",
    title={'y': 0.9, 'x': 0.5, 'yanchor': 'top', 'xanchor': 'center'},
    title_text='Distribution of Nouns in Truths vs. Lies',
    title_font_size=20,
)

fig.show()

## Histogram Number of Verbs

In [52]:
# Overlaid histograms of the verb count per statement, lies vs. truths.
# Both traces are binned over the same range.
verb_bins = dict(start=-0.1, end=9.0)

fig = go.Figure(data=[
    go.Histogram(
        x=lies['count_of_verb'],
        name='Lies',
        marker_color='#FFA15A',
        xbins=verb_bins,
    ),
    # Drawn second and slightly transparent so the lies trace shows through.
    go.Histogram(
        x=truths['count_of_verb'],
        name='Truths',
        marker_color='#636EFA',
        opacity=0.8,
        xbins=verb_bins,
    ),
])

fig.update_layout(
    barmode='overlay',
    bargap=0.05,
    xaxis_title="Number of Verbs",
    yaxis_title="Count",
    title={'y': 0.9, 'x': 0.5, 'yanchor': 'top', 'xanchor': 'center'},
    title_text='Distribution of Verbs in Truths vs. Lies',
    title_font_size=20,
)

fig.show()

# Histogram Readability Score (Gunn)

In [53]:
# Overlaid histograms of the readability score per statement, lies vs. truths.
# NOTE(review): `read_score_gun` is presumably a Gunning fog index -- confirm
# against the feature-extraction code before naming the metric on the axis.
readability_bins = dict(start=-0.1, end=15.0)

fig = go.Figure()
fig.add_trace(go.Histogram(
    x=lies['read_score_gun'],
    name='Lies',
    marker_color='#FFA15A',
    xbins=readability_bins,
))
# Drawn second and slightly transparent so the lies trace shows through.
fig.add_trace(go.Histogram(
    x=truths['read_score_gun'],
    name='Truths',
    marker_color='#636EFA',
    opacity=0.8,
    xbins=readability_bins,
))

fig.update_layout(
    barmode='overlay',
    bargap=0.05,
    # The x axis shows the readability score, not a noun count.
    xaxis_title="Readability Score",
    yaxis_title="Count",
    title={'y': 0.9, 'x': 0.5, 'yanchor': 'top', 'xanchor': 'center'},
    title_text='Distribution of Readability Score in Truths vs. Lies',
    title_font_size=20,
)

fig.show()

# Ideas for how to display POS_NER_SD Stats

In [13]:
# Example grouped bar chart on toy export data (unrelated to the PolitiFact
# results); it appears to serve as a layout template for the statistics charts.
# plotly.graph_objects is already imported as `go` in the notebook's import
# cell, so it is not imported again here.
years = [1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
         2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012]

fig = go.Figure()
fig.add_trace(go.Bar(
    x=years,
    y=[219, 146, 112, 127, 124, 180, 236, 207, 236, 263,
       350, 430, 474, 526, 488, 537, 500, 439],
    name='Rest of world',
    marker_color='rgb(55, 83, 109)',
))
fig.add_trace(go.Bar(
    x=years,
    y=[16, 13, 10, 11, 28, 37, 43, 55, 56, 88, 105, 156, 270,
       299, 340, 403, 549, 499],
    name='China',
    marker_color='rgb(26, 118, 255)',
))

fig.update_layout(
    title='US Export of Plastic Scrap',
    xaxis=dict(tickfont=dict(size=14)),
    # The axis title font is set through title.font; the older `titlefont`
    # attribute is deprecated.
    yaxis=dict(
        title=dict(text='USD (millions)', font=dict(size=16)),
        tickfont=dict(size=14),
    ),
    legend=dict(
        x=0,
        y=1.0,
        bgcolor='rgba(255, 255, 255, 0)',
        bordercolor='rgba(255, 255, 255, 0)',
    ),
    barmode='group',
    bargap=0.15,      # gap between bars of adjacent location coordinates.
    bargroupgap=0.1,  # gap between bars of the same location coordinate.
)
fig.show()

# POS General Stats

In [22]:
# Load the general POS-tag statistics (mean count per tag for truths and lies).
pos_df = pd.read_excel('results/pos_general_tags_stats.xlsx')

# Absolute gap between the truth and lie means, computed for the whole column
# at once instead of looping over rows.
pos_df['diff'] = np.abs(pos_df['t_means'] - pos_df['f_means'])

# Largest gaps first, so the most discriminating tags lead the table and chart.
pos_df = pos_df.sort_values(by='diff', ascending=False)
pos_df

Unnamed: 0,tag,t_means,f_means,p_value,diff
9,proper noun,2.02,2.51,0.0,0.49
6,noun,4.35,3.96,0.0,0.39
7,number,0.96,0.64,0.0,0.32
0,adjective,1.37,1.13,0.0,0.24
1,conjunction or particle or adverb,2.27,2.04,0.0,0.23
11,verb,2.28,2.48,0.0,0.2
4,determiner,1.83,1.64,0.0,0.19
2,adverb,0.7,0.59,0.0,0.11
8,possessive,0.49,0.6,0.0,0.11
3,coordinating conjuction,0.37,0.33,0.004,0.04


In [23]:
# Grouped bar chart of mean general POS-tag counts, truths vs. lies.
# Bars follow the row order of pos_df (sorted by biggest difference of means).
tags = pos_df['tag'].tolist()
truth_means = pos_df['t_means'].tolist()
lie_means = pos_df['f_means'].tolist()

fig = go.Figure()
fig.add_trace(go.Bar(
    x=tags,
    y=truth_means,
    name='Truths',
    marker_color='rgb(26, 118, 255)',
    text=truth_means,
))
fig.add_trace(go.Bar(
    x=tags,
    y=lie_means,
    name='Lies',
    marker_color='lightsalmon',
    text=lie_means,
))

fig.update_layout(
    title=dict(
        text='General Part of Speech Tagging Distribution in Truths vs. Lies',
        x=0.5,
        y=0.9,
        xanchor='center',
        yanchor='top',
        font=dict(size=20),
    ),
    height=650,
    # Rotate the tag names so the long labels do not overlap.
    xaxis=dict(tickfont=dict(size=14), tickangle=-45),
    # The axis title font is set through title.font; the older `titlefont`
    # attribute is deprecated.
    yaxis=dict(
        title=dict(text='Averages (Means)', font=dict(size=16)),
        tickfont=dict(size=14),
    ),
    legend=dict(
        x=1.0,
        y=1.0,
        bgcolor='rgba(255, 255, 255, 0)',
        bordercolor='rgba(255, 255, 255, 0)',
    ),
    barmode='group',
    bargap=0.15,      # gap between bars of adjacent location coordinates.
    bargroupgap=0.1,  # gap between bars of the same location coordinate.
)
# Print each mean above its bar.
fig.update_traces(textposition='outside')
fig.show()
    

# POS Specific Stats

In [24]:
# Load the fine-grained POS-tag statistics (mean count per tag for truths and lies).
poss_df = pd.read_excel('results/pos_specific_tags_stats.xlsx')

# Absolute gap between the truth and lie means, computed for the whole column
# at once instead of looping over rows.
poss_df['diff'] = np.abs(poss_df['t_means'] - poss_df['f_means'])

# Keep only the 15 tags with the largest gap, largest first.
poss_df = poss_df.sort_values(by='diff', ascending=False).head(15)
poss_df

Unnamed: 0,tag,t_means,f_means,p_value,diff
15,"noun, proper singular",1.92,2.41,0.0,0.49
9,"conjunction, subordinating or preposition",2.59,2.24,0.0,0.35
6,cardinal number,0.96,0.64,0.0,0.32
14,"noun, singular or mass",3.09,2.83,0.0,0.26
7,determiner,1.72,1.53,0.0,0.19
23,"verb, base form",0.47,0.61,0.0,0.14
16,"noun, plural",1.29,1.16,0.0,0.13
10,adjective,1.1,0.98,0.0,0.12
27,"verb, non-3rd person singular present",0.48,0.38,0.0,0.1
11,"adjective, comparative",0.18,0.09,0.0,0.09


In [25]:
# Grouped bar chart of mean fine-grained POS-tag counts, truths vs. lies.
# Bars follow the row order of poss_df (sorted by biggest difference of means).
tags = poss_df['tag'].tolist()
truth_means = poss_df['t_means'].tolist()
lie_means = poss_df['f_means'].tolist()

fig = go.Figure()
fig.add_trace(go.Bar(
    x=tags,
    y=truth_means,
    name='Truths',
    marker_color='rgb(26, 118, 255)',
    text=truth_means,
))
fig.add_trace(go.Bar(
    x=tags,
    y=lie_means,
    name='Lies',
    marker_color='lightsalmon',
    text=lie_means,
))

fig.update_layout(
    title=dict(
        text='Specific Part of Speech Tagging Distribution in Truths vs. Lies',
        x=0.5,
        y=0.9,
        xanchor='center',
        yanchor='top',
        font=dict(size=20),
    ),
    height=600,
    # Rotate the tag names so the long labels do not overlap.
    xaxis=dict(tickfont=dict(size=14), tickangle=-45),
    # The axis title font is set through title.font; the older `titlefont`
    # attribute is deprecated.
    yaxis=dict(
        title=dict(text='Averages (Means)', font=dict(size=16)),
        tickfont=dict(size=14),
    ),
    legend=dict(
        x=1.0,
        y=1.0,
        bgcolor='rgba(255, 255, 255, 0)',
        bordercolor='rgba(255, 255, 255, 0)',
    ),
    barmode='group',
    bargap=0.15,      # gap between bars of adjacent location coordinates.
    bargroupgap=0.1,  # gap between bars of the same location coordinate.
)
# Print each mean above its bar.
fig.update_traces(textposition='outside')
fig.show()
    

# NER Stats

In [26]:
# Load the named-entity statistics (mean count per entity type for truths and lies).
ner_df = pd.read_excel('results/ner_stats.xlsx')

# Absolute gap between the truth and lie means, computed for the whole column
# at once instead of looping over rows.
ner_df['diff'] = np.abs(ner_df['t_means'] - ner_df['f_means'])

# Largest gaps first, so the most discriminating entity types lead the table and chart.
ner_df = ner_df.sort_values(by='diff', ascending=False)
ner_df

Unnamed: 0,tag,t_means,f_means,p_value,diff
0,people,0.35,0.51,0.0,0.16
4,dates,0.32,0.19,0.0,0.13
8,numerals,0.31,0.2,0.0,0.11
1,organizations,0.27,0.34,0.0,0.07
5,percentages,0.16,0.09,0.0,0.07
2,regions,0.44,0.39,0.0,0.05
6,money,0.15,0.12,0.004,0.03
7,ordinality,0.05,0.03,0.0,0.02
3,art,0.01,0.02,0.0,0.01


In [27]:
# Grouped bar chart of mean named-entity counts, truths vs. lies.
# Bars follow the row order of ner_df (sorted by biggest difference of means).
tags = ner_df['tag'].tolist()
truth_means = ner_df['t_means'].tolist()
lie_means = ner_df['f_means'].tolist()

fig = go.Figure()
fig.add_trace(go.Bar(
    x=tags,
    y=truth_means,
    name='Truths',
    marker_color='rgb(26, 118, 255)',
    text=truth_means,
))
fig.add_trace(go.Bar(
    x=tags,
    y=lie_means,
    name='Lies',
    marker_color='lightsalmon',
    text=lie_means,
))

fig.update_layout(
    title=dict(
        text='Named Entity Recognition Distribution in Truths vs. Lies',
        x=0.5,
        y=0.9,
        xanchor='center',
        yanchor='top',
        font=dict(size=20),
    ),
    height=600,
    # Rotate the entity names so the labels do not overlap.
    xaxis=dict(tickfont=dict(size=14), tickangle=-45),
    # The axis title font is set through title.font; the older `titlefont`
    # attribute is deprecated.
    yaxis=dict(
        title=dict(text='Averages (Means)', font=dict(size=16)),
        tickfont=dict(size=14),
    ),
    legend=dict(
        x=1.0,
        y=1.0,
        bgcolor='rgba(255, 255, 255, 0)',
        bordercolor='rgba(255, 255, 255, 0)',
    ),
    barmode='group',
    bargap=0.15,      # gap between bars of adjacent location coordinates.
    bargroupgap=0.1,  # gap between bars of the same location coordinate.
)
# Print each mean above its bar.
fig.update_traces(textposition='outside')
fig.show()

# SD Stats

In [28]:
# Load the syntactic-dependency statistics (mean count per dependency for truths and lies).
sd_df = pd.read_excel('results/sd_stats.xlsx')

# Absolute gap between the truth and lie means, computed for the whole column
# at once instead of looping over rows.
sd_df['diff'] = np.abs(sd_df['t_means'] - sd_df['f_means'])

# Largest gaps first, so the most discriminating dependencies lead the table and chart.
sd_df = sd_df.sort_values(by='diff', ascending=False)
sd_df

Unnamed: 0,dependency,t_means,f_means,p_value,diff
7,compound,1.76,2.01,0.0,0.25
10,numeric modifier,0.58,0.37,0.0,0.21
2,adjectival modifier,1.23,1.03,0.0,0.2
9,determiner,1.64,1.46,0.0,0.18
1,adverbial modifier,0.69,0.57,0.0,0.12
6,clausal complement,0.37,0.49,0.0,0.12
3,auxiliary,0.71,0.82,0.0,0.11
11,open clausal complement,0.16,0.21,0.0,0.05
8,conjunct,0.39,0.36,0.034,0.03
4,case marking,0.12,0.15,0.0,0.03


In [29]:
# Grouped bar chart of mean syntactic-dependency counts, truths vs. lies.
# Bars follow the row order of sd_df (sorted by biggest difference of means).
dependencies = sd_df['dependency'].tolist()
truth_means = sd_df['t_means'].tolist()
lie_means = sd_df['f_means'].tolist()

fig = go.Figure()
fig.add_trace(go.Bar(
    x=dependencies,
    y=truth_means,
    name='Truths',
    marker_color='rgb(26, 118, 255)',
    text=truth_means,
))
fig.add_trace(go.Bar(
    x=dependencies,
    y=lie_means,
    name='Lies',
    marker_color='lightsalmon',
    text=lie_means,
))

fig.update_layout(
    title=dict(
        text='Syntactic Dependency Distribution in Truths vs. Lies',
        x=0.5,
        y=0.9,
        xanchor='center',
        yanchor='top',
        font=dict(size=20),
    ),
    height=600,
    # Rotate the dependency names so the long labels do not overlap.
    xaxis=dict(tickfont=dict(size=14), tickangle=-45),
    # The axis title font is set through title.font; the older `titlefont`
    # attribute is deprecated.
    yaxis=dict(
        title=dict(text='Averages (Means)', font=dict(size=16)),
        tickfont=dict(size=14),
    ),
    legend=dict(
        x=1.0,
        y=1.0,
        bgcolor='rgba(255, 255, 255, 0)',
        bordercolor='rgba(255, 255, 255, 0)',
    ),
    barmode='group',
    bargap=0.15,      # gap between bars of adjacent location coordinates.
    bargroupgap=0.1,  # gap between bars of the same location coordinate.
)
# Print each mean above its bar.
fig.update_traces(textposition='outside')
fig.show()

# Machine Learning Results

In [30]:
# Load the classical-ML results and rank the runs by accuracy, best first.
ml_df = pd.read_excel('results/ml_LR_MNB_TFIDF_clean_binarized.xlsx')
ml_df = ml_df.sort_values(by=['Accuracy'], ascending=False)

# Keep one run per model. The frame is sorted by accuracy (descending), so the
# first occurrence of a model name is that model's most accurate run.
best_runs = ml_df.drop_duplicates(subset='Model', keep='first')

# Reverse the ranking so the series run from the least to the most accurate model.
# NOTE: `err` holds ROC AUC scores, not an error measure; the name is kept
# because the plotting cell reads `models`, `acc` and `err`.
models = best_runs['Model'].tolist()[::-1]
acc = best_runs['Accuracy'].tolist()[::-1]
err = np.abs(best_runs['ROC_AUC (on test data)']).tolist()[::-1]

ml_df

Unnamed: 0,Model,N-gram range,N_folds (classifiers),Normalization,ROC_AUC (on test data),Accuracy
12,"Logistic Regression, TFIDF Vectorizer + N-gram",2.0,3,Light,0.7,0.66
14,"Logistic Regression, TFIDF Vectorizer + N-gram",3.0,3,Light,0.71,0.66
0,Logistic Regression (BOW),,3,Light,0.69,0.65
1,MNB (BOW),,3,Light,0.68,0.65
4,Logistic Regression (N-gram),2.0,3,Light,0.7,0.65
5,MNB (N-gram),2.0,3,Light,0.69,0.65
6,Logistic Regression (N-gram),3.0,3,Light,0.7,0.65
7,MNB (N-gram),3.0,3,Light,0.69,0.65
10,Logistic Regression (N-gram),3.0,3,Full,0.68,0.65
2,Logistic Regression (BOW),,3,Full,0.67,0.64


In [31]:
# Line chart of accuracy and ROC AUC per classical-ML model.
# `err` is filled from the 'ROC_AUC (on test data)' column by the preceding
# cell, so it is plotted as ROC AUC despite its name.
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=models,
    y=acc,
    text=acc,
    mode="lines+markers+text",
    marker_color='rgb(26, 118, 255)',
    name='Validation Accuracy',
    textposition='bottom center',
))
fig.add_trace(go.Scatter(
    x=models,
    y=err,
    text=err,
    mode="lines+markers+text",
    marker_color='rgb(255, 118, 16)',
    name='ROC_AUC',
    textposition='top center',
))

fig.update_layout(
    title=dict(
        text='Machine Learning Models Performances',
        x=0.45,
        y=0.92,
        xanchor='center',
        yanchor='top',
        font=dict(size=20),
    ),
    height=750,
    xaxis=dict(tickfont=dict(size=14)),
    # The axis title font is set through title.font; the older `titlefont`
    # attribute is deprecated.
    yaxis=dict(
        title=dict(text='Accuracy and ROC_AUC', font=dict(size=16)),
        tickfont=dict(size=14),
    ),
    legend=dict(
        x=1.0,
        y=1.0,
        bgcolor='rgba(255, 255, 255, 0)',
        bordercolor='rgba(255, 255, 255, 0)',
    ),
)
fig.show()

# AutoML AutoKeras Models Performance

In [32]:
# Load the AutoKeras results, ranked by validation accuracy (best first),
# and preview the first five rows.
ak_df = (
    pd.read_excel('results/ml_autokeras_clean_binarized.xlsx')
    .sort_values(by='Val_acc (test)', ascending=False)
)
ak_df.head(n=5)

Unnamed: 0,Model,max_trials,Max epochs,Loss (train),Accuracy (train),Val_loss (test),Val_acc (test)
2,Text Classifier,15,3,0.5661,0.7128,0.65,0.6463
0,Text Classifier,1,2,0.5983,0.6839,0.99,0.63
3,Text Classifier + Light Normalization,2,2,0.59,0.6899,0.68,0.584
4,Text Classifier + Full Normalization,2,2,0.488,0.77,0.863,0.56
1,Text Classifier,2,5,0.2457,0.9,1.23,0.55


In [33]:
# Keep one run per AutoKeras model. ak_df is sorted by validation accuracy
# (descending), so the first occurrence of a model name is its best run.
best_runs = ak_df.drop_duplicates(subset='Model', keep='first')

# Reverse the ranking so the series run from the least to the most accurate model.
# `err` holds the validation loss of the selected runs.
models_ak = best_runs['Model'].tolist()[::-1]
acc = best_runs['Val_acc (test)'].astype(float).tolist()[::-1]
err = best_runs['Val_loss (test)'].astype(float).tolist()[::-1]

print(models_ak)
print(acc)
print(err)


['Text Classifier + Full Normalization', 'Text Classifier + Light Normalization', 'Text Classifier']
[0.56, 0.584, 0.6463]
[0.863, 0.68, 0.65]


In [34]:
# Line chart of validation accuracy and validation loss per AutoKeras model.
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=models_ak,
    y=acc,
    text=acc,
    mode="lines+markers+text",
    marker_color='rgb(26, 118, 255)',
    name='Validation Accuracy',
    textposition='bottom center',
))
fig.add_trace(go.Scatter(
    x=models_ak,
    y=err,
    text=err,
    mode="lines+markers+text",
    marker_color='rgb(255, 118, 16)',
    name='Validation Loss',
    textposition='top center',
))

fig.update_layout(
    title=dict(
        text='AutoML AutoKeras Models Performances',
        x=0.45,
        y=0.92,
        xanchor='center',
        yanchor='top',
        font=dict(size=20),
    ),
    height=750,
    xaxis=dict(tickfont=dict(size=14)),
    # The axis title font is set through title.font; the older `titlefont`
    # attribute is deprecated.
    yaxis=dict(
        title=dict(text='Accuracy and Loss', font=dict(size=16)),
        tickfont=dict(size=14),
    ),
    legend=dict(
        x=1.0,
        y=1.0,
        bgcolor='rgba(255, 255, 255, 0)',
        bordercolor='rgba(255, 255, 255, 0)',
    ),
)
fig.show()

# Deep Learning Models Performance

## Politifact Binarized Clean (11118 rows)

In [35]:
# Load the deep-learning results for the clean binarized dataset,
# ranked by accuracy (best first).
dl_df = (
    pd.read_excel('results/dl_clean_binarized.xlsx')
    .sort_values(by='Accuracy', ascending=False)
)
dl_df

Unnamed: 0,Model,Max Seq len,Batch size,Lr,epochs,Val loss,Train loss,Accuracy
1,bert-base-uncased,50,48,4e-05,30,0.648,0.651,0.63
7,distilroberta-base,50,48,4e-05,30,0.66,0.664,0.63
0,bert-base-uncased,50,64,1e-05,50,0.657,0.656,0.62
2,bert-base-cased,50,48,4e-05,30,0.664,0.661,0.62
8,albert-base-v2,50,48,4e-05,30,0.651,0.649,0.61
3,bert-large-uncased,50,48,4e-05,30,0.667,0.672,0.59
4,bert-large-cased,50,48,4e-05,30,0.67,0.674,0.58
6,roberta-large,50,48,4e-05,30,0.675,0.677,0.58
5,roberta-base,50,48,4e-05,30,0.677,0.677,0.56


In [36]:
# Keep one run per model. dl_df is sorted by accuracy (descending), so the
# first occurrence of a model name is that model's most accurate run.
best_runs = dl_df.drop_duplicates(subset='Model', keep='first')

# Reverse the ranking so the series run from the least to the most accurate model.
models_dl = best_runs['Model'].tolist()[::-1]
acc_dl = best_runs['Accuracy'].astype(float).tolist()[::-1]
err_dl = best_runs['Val loss'].astype(float).tolist()[::-1]

In [37]:
# Line chart of validation accuracy and validation loss per deep-learning
# model on the clean binarized dataset.
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=models_dl,
    y=acc_dl,
    text=acc_dl,
    mode="lines+markers+text",
    marker_color='rgb(26, 118, 255)',
    name='Validation Accuracy',
    textposition='bottom center',
))
fig.add_trace(go.Scatter(
    x=models_dl,
    y=err_dl,
    text=err_dl,
    mode="lines+markers+text",
    marker_color='rgb(255, 118, 16)',
    name='Validation Loss',
    textposition='top center',
))

fig.update_layout(
    title=dict(
        text='Deep-Learning Models Performances ',
        x=0.45,
        y=0.92,
        xanchor='center',
        yanchor='top',
        font=dict(size=20),
    ),
    height=750,
    # Rotate the model names so the labels do not overlap.
    xaxis=dict(tickfont=dict(size=14), tickangle=-45),
    # The axis title font is set through title.font; the older `titlefont`
    # attribute is deprecated.
    yaxis=dict(
        title=dict(text='Accuracy and Loss', font=dict(size=16)),
        tickfont=dict(size=14),
    ),
    legend=dict(
        x=1.0,
        y=1.0,
        bgcolor='rgba(255, 255, 255, 0)',
        bordercolor='rgba(255, 255, 255, 0)',
    ),
)
fig.show()

## Politifact Strict Binarized (6k rows)

In [38]:
# Load the deep-learning results for the strict binarized dataset,
# ranked by accuracy (best first).
dls_df = (
    pd.read_excel('results/dl_strict_binarized.xlsx')
    .sort_values(by='Accuracy', ascending=False)
)
dls_df

Unnamed: 0,Model,Max Seq len,Batch size,Lr,epochs,Val loss,Train loss,Accuracy
6,roberta-base,50,48,4e-05,30,0.685,0.685,0.68
9,roberta-large,50,48,4e-05,30,0.679,0.685,0.67
10,distilroberta-base,50,48,4e-05,30,0.673,0.677,0.66
2,bert-base-uncased,50,48,4e-05,100,0.615,0.635,0.65
7,roberta-base,50,48,5e-06,50,0.691,0.69,0.65
11,distilroberta-base,50,48,1e-05,50,0.684,0.684,0.64
8,roberta-base,50,48,1e-06,100,0.692,0.692,0.63
4,bert-large-uncased,50,48,4e-05,30,0.667,0.668,0.61
0,bert-base-uncased,50,64,1e-05,50,0.65,0.659,0.6
1,bert-base-uncased,50,48,4e-05,30,0.636,0.647,0.6


In [39]:
# Keep one run per model. dls_df is sorted by accuracy (descending), so the
# first occurrence of a model name is that model's most accurate run.
best_runs = dls_df.drop_duplicates(subset='Model', keep='first')

# Reverse the ranking so the series run from the least to the most accurate model.
models_dls = best_runs['Model'].tolist()[::-1]
acc_dls = best_runs['Accuracy'].astype(float).tolist()[::-1]
err_dls = best_runs['Val loss'].astype(float).tolist()[::-1]

In [40]:
# Line chart of validation accuracy and validation loss per deep-learning
# model on the strict binarized dataset.
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=models_dls,
    y=acc_dls,
    text=acc_dls,
    mode="lines+markers+text",
    marker_color='rgb(26, 118, 255)',
    name='Validation Accuracy',
    textposition='bottom center',
))
fig.add_trace(go.Scatter(
    x=models_dls,
    y=err_dls,
    text=err_dls,
    mode="lines+markers+text",
    marker_color='rgb(255, 118, 16)',
    name='Validation Loss',
    textposition='top center',
))

fig.update_layout(
    title=dict(
        text='Deep-Learning Models Performances (Strict Dataset)',
        x=0.45,
        y=0.92,
        xanchor='center',
        yanchor='top',
        font=dict(size=20),
    ),
    height=750,
    xaxis=dict(tickfont=dict(size=14)),
    # The axis title font is set through title.font; the older `titlefont`
    # attribute is deprecated.
    yaxis=dict(
        title=dict(text='Accuracy and Loss', font=dict(size=16)),
        tickfont=dict(size=14),
    ),
    legend=dict(
        x=1.0,
        y=1.0,
        bgcolor='rgba(255, 255, 255, 0)',
        bordercolor='rgba(255, 255, 255, 0)',
    ),
)
fig.show()