In [1]:
# Libraries

import pandas as pd
import numpy as np
import matplotlib as plt
import re

## Harvard mass mobilization 1990 - 2019

**About the dataset**

Source: https://massmobilization.github.io

Protest is a dichotomous coding for whether or not there was a protest action in a particular period
The project defines a protest as a gathering of 50 or more people to make a demand of the government. A
protest action must be targeted at the state or state policy. The project does not code protests in one
country that are targeted at the policies of another country and in that sense, it captures only “home
grown” protest activities targeted at state policies.

We are interested in mobilization of anti-state protest, not necessarily community disputes between
various groups, and therefore we do not code inter-communal demonstrations. 

Protests that involve an industrial enterprise that is affected by state policy
such as labor rights or safety, does represent a codeable activity only if the people take to the streets to
demand better conditions, wages, safety, and the outcome is a function of state level policy decisions. A
union action targeted specifically at conditions linked to a specific company and are part of labor
negotiations do not constitute a protest, unless or until the labor action becomes a public event (by, say,
marching in the streets or demanding action on the part of the government).

In [2]:
# Datasets

# Mass mobilization:
harvardmm_df = pd.read_csv('Data/Mass_Mobilization_Harvard.csv')
harvardmm_df_original = harvardmm_df.copy


freedom_df = pd.read_excel('Data/2020_freedom_house_clean.xlsx')
freedom_df_original = freedom_df.copy

In [3]:
# Harvard dataset includes some rows were protests are not deemed relevant. These are indicated with a 0 in the column 'protests'
# I drop these lines since they are irrelevant to the current analysis

harvardmm_filtered = harvardmm_df[harvardmm_df['protest'] !=0]

# Most interested in Year, Country, Protest Name(protesterdemand1), Motivations (protesterdemand2), Peak Size (participants)
harvardmm_filtered = harvardmm_filtered[['year', 'country', 'protesterdemand1', 'protesterdemand2', 'participants']]
harvardmm_filtered

Unnamed: 0,year,country,protesterdemand1,protesterdemand2,participants
0,1990,Canada,"political behavior, process",labor wage dispute,1000s
1,1990,Canada,"political behavior, process",,1000
2,1990,Canada,"political behavior, process",,500
3,1990,Canada,land farm issue,,100s
4,1990,Canada,"political behavior, process",,950
...,...,...,...,...,...
16355,2014,Papua New Guinea,"political behavior, process",,100+
16357,2016,Papua New Guinea,removal of politician,,About 1000
16358,2017,Papua New Guinea,"political behavior, process",land farm issue,50+
16359,2017,Papua New Guinea,"political behavior, process",,50+


In [4]:
# I want to add the freedom index from randomhouse to the harvard dataset too
# For this, replicate the years vertically and add the value per row and column downwards
freedom = freedom_df.copy()
freedom.rename(columns={'Year(s) Under Review':'country'}, inplace = True)
freedom.head()

Unnamed: 0,country,1990,1991,1992,1993,1994,1995,1996,1997,1998,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Afghanistan,NF,NF,NF,NF,NF,NF,NF,NF,NF,...,NF,NF,NF,NF,NF,NF,NF,NF,NF,NF
1,Albania,NF,PF,PF,PF,PF,PF,PF,PF,PF,...,PF,PF,PF,PF,PF,PF,PF,PF,PF,PF
2,Algeria,PF,PF,NF,NF,NF,NF,NF,NF,NF,...,NF,NF,NF,NF,NF,NF,NF,NF,NF,NF
3,Andorra,-,-,-,F,F,F,F,F,F,...,F,F,F,F,F,F,F,F,F,F
4,Angola,NF,PF,NF,NF,NF,NF,NF,NF,NF,...,NF,NF,NF,NF,NF,NF,NF,NF,NF,NF


In [5]:
list_of_cols = list(freedom.columns)
pivoted = pd.melt(freedom, id_vars = ['country'], value_vars=list_of_cols[1:], var_name = 'year', value_name='freedom')
pivoted.head()

Unnamed: 0,country,year,freedom
0,Afghanistan,1990,NF
1,Albania,1990,NF
2,Algeria,1990,PF
3,Andorra,1990,-
4,Angola,1990,NF


In [6]:
pivoted.shape

(6150, 3)

In [7]:
# insert pivoted into harvardmm_filtered
harvard = harvardmm_filtered.merge(pivoted, on=['country', 'year'], how = 'left')
harvard

#Check: the df keeps the same number of rows as harvardmm_filtered

Unnamed: 0,year,country,protesterdemand1,protesterdemand2,participants,freedom
0,1990,Canada,"political behavior, process",labor wage dispute,1000s,F
1,1990,Canada,"political behavior, process",,1000,F
2,1990,Canada,"political behavior, process",,500,F
3,1990,Canada,land farm issue,,100s,F
4,1990,Canada,"political behavior, process",,950,F
...,...,...,...,...,...,...
14509,2014,Papua New Guinea,"political behavior, process",,100+,PF
14510,2016,Papua New Guinea,removal of politician,,About 1000,PF
14511,2017,Papua New Guinea,"political behavior, process",land farm issue,50+,PF
14512,2017,Papua New Guinea,"political behavior, process",,50+,PF


In [8]:
# participants column cleanup
harvard.isnull().sum(axis = 0)

year                    0
country                 0
protesterdemand1       11
protesterdemand2    11838
participants           12
freedom               513
dtype: int64

In [9]:
harvard.participants.fillna('', inplace=True)

In [10]:
# harvard.participants.unique()

In [11]:
def number_cleaner(text):
    a = re.findall(r'[-><]\s*(\d+)', text)
    if a == []:
        b = re.findall(r'\d+', text)
        if b ==[]:
            return text
        else:
            return b[0]
    else:
        return a[0]

harvard['participants'] = harvard.participants.apply(number_cleaner)

In [12]:
harvard.loc[harvard['participants'] =='dozens','participants'] ='12'
harvard.loc[harvard['participants'] =='several dozen','participants'] ='12'
harvard.loc[harvard['participants'] =='a group','participants'] ='unknown'
harvard.loc[harvard['participants'] =='busloads','participants'] ='unknown'
harvard.loc[harvard['participants'] =='widespread','participants'] ='unknown'
harvard.loc[harvard['participants'] =='Hundreds','participants'] ='100'
harvard.loc[harvard['participants'] =='hundreds','participants'] ='100'
harvard.loc[harvard['participants'] =='hundreds ','participants'] ='100'
harvard.loc[harvard['participants'] =='several hundred','participants'] ='100'
harvard.loc[harvard['participants'] =='several thousand','participants'] ='1000'
harvard.loc[harvard['participants'] =='Thousands','participants'] ='1000'
harvard.loc[harvard['participants'] =='tens of thousands','participants'] ='10000'
harvard.loc[harvard['participants'] =='hundreds of thousands','participants'] ='100000'
harvard.loc[harvard['participants'] =='several dozen arrests','participants'] ='unknown'
harvard.loc[harvard['participants'] =='a few dozen','participants'] ='12'
harvard.loc[harvard['participants'] =='thousands','participants'] ='1000'
harvard.loc[harvard['participants'] =='a few thousand','participants'] ='1000'
harvard.loc[harvard['participants'] =='Several thousand','participants'] ='1000'
harvard.loc[harvard['participants'] =='scores','participants'] ='unknown'
harvard.loc[harvard['participants'] =='more than a dozen schools','participants'] ='12'
harvard.loc[harvard['participants'] =='dozens arrested ','participants'] ='unknown'
harvard.loc[harvard['participants'] =='Thousands ','participants'] ='1000'
harvard.loc[harvard['participants'] =='very low thousands','participants'] ='1000'
harvard.loc[harvard['participants'] =='several hundreds','participants'] ='100'
harvard.loc[harvard['participants'] =='hundreds  ','participants'] ='100'
harvard.loc[harvard['participants'] =='hundreds of thousands ','participants'] ='100000'
harvard.loc[harvard['participants'] =='Hundreds of thousands','participants'] ='100000'
harvard.loc[harvard['participants'] =='Dozens','participants'] ='12'
harvard.loc[harvard['participants'] =='several dozens','participants'] ='12'
harvard.loc[harvard['participants'] =='millions','participants'] ='1000000'
harvard.loc[harvard['participants'] =='teachers and pupils','participants'] ='unknown'
harvard.loc[harvard['participants'] =='few thousand','participants'] ='1000'
harvard.loc[harvard['participants'] =='several hundred thousands','participants'] ='100000'
harvard.loc[harvard['participants'] =='a few thousand ','participants'] ='1000'
harvard.loc[harvard['participants'] =='couple thousands','participants'] ='2000'
harvard.loc[harvard['participants'] =='a few hundred','participants'] ='100'
harvard.loc[harvard['participants'] =='several thousand ','participants'] ='1000'
harvard.loc[harvard['participants'] =='dozens arrested','participants'] ='unknown'
harvard.loc[harvard['participants'] =='Hundreds of people','participants'] ='100'
harvard.loc[harvard['participants'] =='Hundreds of workers','participants'] ='100'
harvard.loc[harvard['participants'] =='Thousands of miners','participants'] ='1000'
harvard.loc[harvard['participants'] =='Several hundred demonstrators','participants'] ='100'
harvard.loc[harvard['participants'] =='Hundreds of villagers','participants'] ='100'
harvard.loc[harvard['participants'] =='Thousands of people','participants'] ='1000'
harvard.loc[harvard['participants'] =='A few hundred','participants'] ='100'
harvard.loc[harvard['participants'] =='Tens of thousands of people','participants'] ='100'
harvard.loc[harvard['participants'] =='few dozen','participants'] ='100'
harvard.loc[harvard['participants'] =='Unknown','participants'] ='unknown'

In [13]:
harvard.loc[harvard['participants'] =='Several thousand protesters','participants'] ='1000'
harvard.loc[harvard['participants'] =='Demonstrators','participants'] ='unknown'
harvard.loc[harvard['participants'] =='Tens of thousands','participants'] ='10000'
harvard.loc[harvard['participants'] =='Large demonstrations','participants'] ='unknown'
harvard.loc[harvard['participants'] =='Huge protests','participants'] ='1000'
harvard.loc[harvard['participants'] =='Hundreds of student protesters','participants'] ='unknown'
harvard.loc[harvard['participants'] =='Mass protests','participants'] ='1000'
harvard.loc[harvard['participants'] =='HUNDREDS OF thousands of Venezuelans','participants'] ='100000'
harvard.loc[harvard['participants'] =='Rocked by protests','participants'] ='unknown'
harvard.loc[harvard['participants'] =='More than seven million people','participants'] ='7000000'
harvard.loc[harvard['participants'] =='Millions','participants'] ='1000000'
harvard.loc[harvard['participants'] =='Hundreds of youth','participants'] ='unknown'
harvard.loc[harvard['participants'] =='A general strike that paralysed Venezuela','participants'] ='1000'
harvard.loc[harvard['participants'] =='Protesters on whom the government security forces used water cannons, rubber bullets and batons','participants'] ='unknown'
harvard.loc[harvard['participants'] =='A few hundred people','participants'] ='100'
harvard.loc[harvard['participants'] =='About a hundred protesters','participants'] ='100'
harvard.loc[harvard['participants'] =='Hundreds of Venezuelans','participants'] ='100'
harvard.loc[harvard['participants'] =='up to two million','participants'] ='2000000'
harvard.loc[harvard['participants'] =='increasingly large gatherings','participants'] ='unknown'
harvard.loc[harvard['participants'] =='over a million','participants'] ='1000000'
harvard.loc[harvard['participants'] =='Crowds','participants'] ='unknown'
harvard.loc[harvard['participants'] =='Hundreds of college students','participants'] ='100'
harvard.loc[harvard['participants'] =='Widespread protests throughout the valley','participants'] ='unknown'
harvard.loc[harvard['participants'] =='Volunteers and a line formed of people participating to cook beef and share it with strangers in the streets','participants'] ='unknown'
harvard.loc[harvard['participants'] =='Thousands of Indians, including Bollywood actors','participants'] ='1000'
harvard.loc[harvard['participants'] =='Civilians; a crowd','participants'] ='unknown'
harvard.loc[harvard['participants'] =='Protests across India','participants'] ='unknown'
harvard.loc[harvard['participants'] =='Hundreds of residents','participants'] ='100'
harvard.loc[harvard['participants'] =='Several hundred','participants'] ='100'
harvard.loc[harvard['participants'] =='Mass demonstrations','participants'] ='100'

In [14]:
harvard.loc[harvard['participants'] =='"Rocked by protests"','participants'] ='unknown'
harvard.loc[harvard['participants'] =='Hundreds of youths','participants'] ='100'

In [15]:
harvard.participants.unique()

array(['1000', '500', '100', '950', '200', '110000', '10000', '6000',
       '3000', '126000', '45000', '2000', '400', '10', '250', '170000',
       '300', '12000', '1500', '50', '12', 'unknown', '80', '2500', '150',
       '8000', '4000', '15000', '600', '270', '350', '5000', '20000',
       '100000', '30000', '70000', '80000', '60', '200000', '750000',
       '90000', '150000', '400000', '1000000', '50000', '11000', '230',
       '180', '35000', '', '40000', '130', '60000', '23000', '14000',
       '900', '1200', '1700', '700', '800', '280', '2400', '90', '164',
       '7000', '25000', '240000', '1100', '4500', '75000', '250000',
       '500000', '300000', '120000', '13800', '600000', '800000',
       '900000', '700000', '350000', '450000', '5', '53000', '280000',
       '72000', '270000', '1', '1300000', '22000', '135000', '9000',
       '290000', '44000', '36000', '168000', '20', '7000000', '16', '75',
       '1400', '4800', '2450', '6500', '181', '32000', '18500', '.',
       '240

In [28]:
#harvard[~harvard.participants.str.contains("unknown")]

In [26]:
harvard1 = harvard[harvard['participants'].str.contains('unknown')] 
harvard1

Unnamed: 0,year,country,protesterdemand1,protesterdemand2,participants,freedom
40,2016,Canada,"political behavior, process",,unknown,F
43,2016,Canada,"political behavior, process",,unknown,F
45,2018,Canada,"political behavior, process",,unknown,F
1342,2016,Venezuela,"price increases, tax policy",,unknown,NF
1354,2017,Venezuela,"political behavior, process",,unknown,NF
1355,2017,Venezuela,"political behavior, process",,unknown,NF
1359,2017,Venezuela,"political behavior, process",,unknown,NF
1364,2017,Venezuela,"political behavior, process",,unknown,NF
1369,2017,Venezuela,"political behavior, process",,unknown,NF
1976,2016,Brazil,"political behavior, process",,unknown,F


In [27]:
len(harvard1)

24

In [44]:
harvard2 = harvard[(harvard.participants != 'unknown') & (harvard.participants != '.')].copy()

In [46]:
harvard2['participants'] = pd.to_numeric(harvard2["participants"])

In [47]:
harvard2.dtypes

year                 object
country              object
protesterdemand1     object
protesterdemand2     object
participants        float64
freedom              object
dtype: object

In [165]:
harvard3 = harvard2[(harvard2.participants >= 1000)]
harvard3

Unnamed: 0,year,country,protesterdemand1,protesterdemand2,participants,freedom
0,1990,Canada,"political behavior, process",labor wage dispute,1000.0,F
1,1990,Canada,"political behavior, process",,1000.0,F
6,1991,Canada,labor wage dispute,,110000.0,F
7,1991,Canada,labor wage dispute,,110000.0,F
8,1992,Canada,police brutality,,1000.0,F
...,...,...,...,...,...,...
14505,2012,Papua New Guinea,"political behavior, process",,1000.0,PF
14506,2012,Papua New Guinea,"political behavior, process",,1000.0,PF
14507,2012,Papua New Guinea,"political behavior, process",,1000.0,PF
14508,2013,Papua New Guinea,"political behavior, process",,2000.0,PF


In [166]:
harvard3['year'] = pd.to_numeric(harvard3['year'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  harvard3['year'] = pd.to_numeric(harvard3['year'])


In [167]:
harvard3.dtypes

year                  int64
country              object
protesterdemand1     object
protesterdemand2     object
participants        float64
freedom              object
dtype: object

In [168]:
harvard3.isnull().sum(axis = 0)

year                   0
country                0
protesterdemand1       6
protesterdemand2    5181
participants           0
freedom              195
dtype: int64

In [170]:
counts = []
for i in harvard3['year']:
    counts.append(1)
       
harvard3["counts"] = counts   
harvard3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  harvard3["counts"] = counts


Unnamed: 0,year,country,protesterdemand1,protesterdemand2,participants,freedom,counts
0,1990,Canada,"political behavior, process",labor wage dispute,1000.0,F,1
1,1990,Canada,"political behavior, process",,1000.0,F,1
6,1991,Canada,labor wage dispute,,110000.0,F,1
7,1991,Canada,labor wage dispute,,110000.0,F,1
8,1992,Canada,police brutality,,1000.0,F,1
...,...,...,...,...,...,...,...
14505,2012,Papua New Guinea,"political behavior, process",,1000.0,PF,1
14506,2012,Papua New Guinea,"political behavior, process",,1000.0,PF,1
14507,2012,Papua New Guinea,"political behavior, process",,1000.0,PF,1
14508,2013,Papua New Guinea,"political behavior, process",,2000.0,PF,1


Unnamed: 0,country,freedom
0,Afghanistan,NF
1,Albania,PF
2,Algeria,NF
3,Angola,NF
4,Argentina,F
...,...,...
153,Vietnam,NF
154,Yemen,NF
155,Yugoslavia,PF
156,Zambia,PF


In [255]:
# save dataframe
harvard3.to_csv('./Data/harvard_clean.csv', encoding='utf-8')
harvard3.to_excel('./Data/harvard_clean.xlsx')

#### Protests per year globally

In [171]:
protest_year = harvard3['year'].value_counts()
protest_year = protest_year.sort_index()
protest_year = protest_year.to_frame()

In [172]:
participants_year = harvard3.groupby('year')['participants'].mean().round(0)
participants_year = participants_year.to_frame()

In [173]:
yearly_protests = protest_year.join(participants_year)
yearly_protests.columns=['protest_count', 'participants']

In [174]:
yearly_protests.reset_index(inplace=True)

In [175]:
yearly_protests

Unnamed: 0,index,protest_count,participants
0,1990,327,23971.0
1,1991,221,45619.0
2,1992,204,47143.0
3,1993,148,24581.0
4,1994,188,38029.0
5,1995,164,37247.0
6,1996,208,45688.0
7,1997,253,31536.0
8,1998,228,21580.0
9,1999,183,25319.0


In [256]:
yearly_protests.dtypes

index              int64
protest_count      int64
participants     float64
dtype: object

In [250]:
# save dataframe
yearly_protests.to_csv('./Data/yearly_protests.csv', encoding='utf-8')
yearly_protests.to_excel('./Data/yearly_protests.xlsx')

#### Protests per country 

In [177]:
protests_country=harvard3.groupby(['country', 'year']).aggregate({'counts': np.count_nonzero,
                                          'participants': np.mean}).round(0)

In [178]:
protests_country.head(50)

Unnamed: 0_level_0,Unnamed: 1_level_0,counts,participants
country,year,Unnamed: 2_level_1,Unnamed: 3_level_1
Afghanistan,1991,1,2000.0
Afghanistan,1997,1,2000.0
Afghanistan,1998,1,2000.0
Afghanistan,2011,1,10000.0
Afghanistan,2012,1,1500.0
Afghanistan,2014,2,8000.0
Afghanistan,2015,3,4000.0
Afghanistan,2016,2,5500.0
Afghanistan,2017,2,1500.0
Afghanistan,2018,1,5000.0


In [251]:
# save dataframe
protests_country.to_csv('./Data/protests_country.csv', encoding='utf-8')
protests_country.to_excel('./Data/protests_country.xlsx')

In [268]:
protests_country_global=harvard3.groupby(['country']).aggregate({'counts': np.count_nonzero,
                                          'participants': np.mean, 'freedom': pd.Series.mode}).round(0)

protests_country_global

Unnamed: 0_level_0,counts,participants,freedom
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Afghanistan,15,4300.0,NF
Albania,52,10019.0,PF
Algeria,31,76000.0,NF
Angola,7,4286.0,NF
Argentina,73,44034.0,F
...,...,...,...
Vietnam,7,2286.0,NF
Yemen,87,36828.0,NF
Yugoslavia,108,28479.0,PF
Zambia,24,2708.0,PF


In [269]:
protests_country_global.sort_values('counts', ascending=False).head(50)

Unnamed: 0_level_0,counts,participants,freedom
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
South Korea,248,27097.0,F
France,245,113528.0,F
Germany,225,30805.0,F
United Kingdom,197,24266.0,F
Romania,163,17077.0,F
Greece,160,21702.0,F
Venezuela,146,145518.0,PF
Thailand,146,18645.0,PF
China,133,25492.0,NF
Ireland,117,9044.0,F


In [289]:
protests_country_global.sort_values('participants', ascending=False).head(50)

Unnamed: 0_level_0,counts,participants,freedom
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Italy,84,185556.0,F
Venezuela,146,145518.0,PF
Lebanon,34,141118.0,PF
Spain,110,119214.0,F
France,245,113528.0,F
Colombia,98,107560.0,PF
Morocco,20,87850.0,PF
Bangladesh,96,86271.0,PF
India,103,84828.0,F
Madagascar,38,76816.0,PF


#### No. of protests vs. economic freedom index

In [221]:
grouped = pd.DataFrame(harvard3.groupby(['country'])['freedom'].agg(pd.Series.mode))
grouped.reset_index(inplace=True)
grouped

Unnamed: 0,country,freedom
0,Afghanistan,NF
1,Albania,PF
2,Algeria,NF
3,Angola,NF
4,Argentina,F
...,...,...
153,Vietnam,NF
154,Yemen,NF
155,Yugoslavia,PF
156,Zambia,PF


In [253]:
# save dataframe
grouped.to_csv('./Data/country_freedom.csv', encoding='utf-8')
grouped.to_excel('./Data/country_freedom.xlsx')

In [237]:
harvard3['freedom'].astype('category')

0         F
1         F
6         F
7         F
8         F
         ..
14505    PF
14506    PF
14507    PF
14508    PF
14510    PF
Name: freedom, Length: 6499, dtype: category
Categories (4, object): [-, F, NF, PF]

In [239]:
freedom_dummies=pd.get_dummies(harvard3['freedom'])
freedom_dummies

Unnamed: 0,-,F,NF,PF
0,0,1,0,0
1,0,1,0,0
6,0,1,0,0
7,0,1,0,0
8,0,1,0,0
...,...,...,...,...
14505,0,0,0,1
14506,0,0,0,1
14507,0,0,0,1
14508,0,0,0,1


In [242]:
merged_freedom=pd.concat([harvard3['counts'], freedom_dummies], axis=1)

In [244]:
merged_freedom2=pd.concat([harvard3['country'], merged_freedom], axis=1)
merged_freedom2

Unnamed: 0,country,counts,-,F,NF,PF
0,Canada,1,0,1,0,0
1,Canada,1,0,1,0,0
6,Canada,1,0,1,0,0
7,Canada,1,0,1,0,0
8,Canada,1,0,1,0,0
...,...,...,...,...,...,...
14505,Papua New Guinea,1,0,0,0,1
14506,Papua New Guinea,1,0,0,0,1
14507,Papua New Guinea,1,0,0,0,1
14508,Papua New Guinea,1,0,0,0,1


In [246]:
merged_freedom3=merged_freedom2.groupby(['country']).aggregate({'counts': np.count_nonzero,
                                          'F': np.mean, 'NF': np.mean, 'PF': np.mean}).round(0)
merged_freedom3

Unnamed: 0_level_0,counts,F,NF,PF
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Afghanistan,15,0.0,1.0,0.0
Albania,52,0.0,0.0,1.0
Algeria,31,0.0,1.0,0.0
Angola,7,0.0,1.0,0.0
Argentina,73,1.0,0.0,0.0
...,...,...,...,...
Vietnam,7,0.0,1.0,0.0
Yemen,87,0.0,1.0,0.0
Yugoslavia,108,0.0,0.0,1.0
Zambia,24,0.0,0.0,1.0


In [249]:
freedom_correlation=merged_freedom3.corr()
freedom_correlation

Unnamed: 0,counts,F,NF,PF
counts,1.0,0.162047,-0.16142,0.034226
F,0.162047,1.0,-0.354728,-0.525784
NF,-0.16142,-0.354728,1.0,-0.391306
PF,0.034226,-0.525784,-0.391306,1.0


In [254]:
# save dataframe
freedom_correlation.to_csv('./Data/freedom_corr.csv', encoding='utf-8')
freedom_correlation.to_excel('./Data/freedom_corr.xlsx')

In [233]:
grouped.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158 entries, 0 to 157
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   country  158 non-null    object
 1   freedom  158 non-null    object
dtypes: object(2)
memory usage: 2.6+ KB


In [247]:
protests_freedom=harvard3.groupby(['country', 'year']).aggregate({'counts': np.count_nonzero,
                                          'participants': np.mean}).round(0)

protests_freedom

Unnamed: 0_level_0,Unnamed: 1_level_0,counts,participants
country,year,Unnamed: 2_level_1,Unnamed: 3_level_1
Afghanistan,1991,1,2000.0
Afghanistan,1997,1,2000.0
Afghanistan,1998,1,2000.0
Afghanistan,2011,1,10000.0
Afghanistan,2012,1,1500.0
...,...,...,...
Zimbabwe,2005,1,3000.0
Zimbabwe,2006,1,1000.0
Zimbabwe,2012,1,1000.0
Zimbabwe,2017,1,10000.0


In [176]:
harvard3.loc[harvard3['country'] == 'Afghanistan']

Unnamed: 0,year,country,protesterdemand1,protesterdemand2,participants,freedom,counts
11489,1991,Afghanistan,police brutality,,2000.0,NF,1
11490,1997,Afghanistan,police brutality,,2000.0,NF,1
11491,1998,Afghanistan,social restrictions,,2000.0,NF,1
11496,2011,Afghanistan,"political behavior, process",,10000.0,NF,1
11499,2012,Afghanistan,removal of politician,,1500.0,NF,1
11503,2014,Afghanistan,"political behavior, process",,1000.0,NF,1
11504,2014,Afghanistan,"political behavior, process",,15000.0,NF,1
11508,2015,Afghanistan,"political behavior, process",police brutality,1000.0,NF,1
11510,2015,Afghanistan,"political behavior, process",,1000.0,NF,1
11511,2015,Afghanistan,"political behavior, process",removal of politician,10000.0,NF,1


In [271]:
protests_reasons=harvard3.groupby(['protesterdemand1']).aggregate({'counts': np.count_nonzero,
                                          'participants': np.mean}).round(0)

protests_reasons

Unnamed: 0_level_0,counts,participants
protesterdemand1,Unnamed: 1_level_1,Unnamed: 2_level_1
labor wage dispute,702,74446.0
land farm issue,126,8008.0
police brutality,248,22505.0
"political behavior, process",4239,31819.0
"price increases, tax policy",443,22933.0
removal of politician,567,36129.0
social restrictions,168,26232.0


In [278]:
protests_reasons=protests_reasons.sort_values('counts', ascending=False).head(50)
protests_reasons

Unnamed: 0_level_0,counts,participants
protesterdemand1,Unnamed: 1_level_1,Unnamed: 2_level_1
"political behavior, process",4239,31819.0
labor wage dispute,702,74446.0
removal of politician,567,36129.0
"price increases, tax policy",443,22933.0
police brutality,248,22505.0
social restrictions,168,26232.0
land farm issue,126,8008.0


In [279]:
# save dataframe
protests_reasons.to_csv('./Data/reasons.csv', encoding='utf-8')
protests_reasons.to_excel('./Data/reasons.xlsx')