In [3]:
# Libraries

import pandas as pd
import numpy as np
import matplotlib as plt

In [12]:
# Datasets

# Mass mobilization:
carnegiemm_df = pd.read_csv('Data/Global_Protest_Tracker.csv')
harvardmm_df = pd.read_csv('Data/Mass_Mobilization_Harvard.csv')
freedom_df = pd.read_excel('Data/2020_freedom_house_clean.xlsx')

carnegiemm_df_original = carnegiemm_df.copy
harvardmm_df_original = harvardmm_df.copy
freedom_df_original = freedom_df.copy

## Carnegie mass mobilization data exploration

**About the dataset**

Source: https://carnegieendowment.org/publications/interactive/protest-tracker?gclid=Cj0KCQjw6uT4BRD5ARIsADwJQ19KBrMTliLck_-hYMpZhoKbwfuMqAaFWQZbUSb2gFZVszeJ6vKutHcaAvtrEALw_wcB


All the protests included had +700 people attending at their peak moment

In [13]:
# rows, columns

carnegiemm_df.shape

(128, 11)

In [14]:
carnegiemm_df.head()

Unnamed: 0,Country,Protest Name,Start Date,Freedom Rating (Status) at Start Date,Triggers,Motivations,Peak Size,Key Participants,Duration,Outcomes,Unnamed: 10
0,Albania,Local elections protests,Feb-19,Partly free,Suspicions that Prime Minister Edi Rama engage...,Electoral fraud and corruption.,">10,000",Opposition parties.,5 months,No policy/leadership change in response to the...,
1,Algeria,Protests against “Le Pouvoir”,Feb-19,Not free,Then president Abdelaziz Bouteflika’s decision...,"Economic downturn, corruption and nepotism, an...",">1,000,000",Opposition parties; a wide range of everyday c...,Active,Bouteflika resigned. Military and political el...,
2,Argentina,Austerity protests,Sep-18,Free,Proposal of austerity measures mandated by the...,"Rising inflation and unemployment, as well as ...",">10,000","Teachers, social organizations, leftist groups.",1 year (sporadic),No policy/leadership change in response to the...,
3,Argentina,Violence against women protests,Apr-20,Free,Increasing rates of violence and femicide agai...,Violence against women and lack of resources/i...,Thousands,Women.,1 month,The Supreme Court of Mendoza established sever...,
4,Armenia,“#RejectSerzh” protests,Apr-18,Partly free,Election of then president Serzh Sargsyan as p...,"Democratic backslide, poverty, and corruption.",">100,000","Students, opposition leaders.",2 weeks,Sargsyan resigned as prime minister and was su...,


In [15]:
carnegiemm_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128 entries, 0 to 127
Data columns (total 11 columns):
 #   Column                                 Non-Null Count  Dtype 
---  ------                                 --------------  ----- 
 0   Country                                128 non-null    object
 1   Protest Name                           128 non-null    object
 2   Start Date                             128 non-null    object
 3   Freedom Rating (Status) at Start Date  128 non-null    object
 4   Triggers                               128 non-null    object
 5   Motivations                            128 non-null    object
 6   Peak Size                              128 non-null    object
 7   Key Participants                       128 non-null    object
 8   Duration                               128 non-null    object
 9   Outcomes                               128 non-null    object
 10  Unnamed: 10                            1 non-null      object
dtypes: object(11)
memor

In [16]:
# Most interested in Country, Protest Name, Start Date, Freedom Rating, Motivations, Peak Size
carnegiemm_filtered = carnegiemm_df[['Country', 'Protest Name', 'Start Date', 'Freedom Rating (Status) at Start Date', 'Motivations', 'Peak Size']]

# Note: Freedom Rating reflects the rating of a country in the Freedom House’s annual Freedom in the World report at the time the protest started, reflecting the status of political rights and civil liberties in that country.

In [17]:
# rename freedom rate column
carnegie_filtered = carnegiemm_filtered.rename(columns={'Freedom Rating (Status) at Start Date':'freedom'}, inplace=True)
carnegie_filtered

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [18]:
# Turn Start Date into just the year + int type

#for csv
carnegiemm_filtered['Year'] = carnegiemm_filtered['Start Date'].str[-2:]
carnegiemm_filtered['Year'] = '20' + carnegiemm_filtered['Year'].astype(str)
carnegiemm_filtered['Year'] = carnegiemm_filtered['Year'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  carnegiemm_filtered['Year'] = carnegiemm_filtered['Start Date'].str[-2:]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  carnegiemm_filtered['Year'] = '20' + carnegiemm_filtered['Year'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  carnegiemm_filtered['Year'] = carnegiemm_filtered['Year

In [19]:
carnegiemm_filtered2 = carnegiemm_filtered[['Year', 'Country', 'freedom', 'Protest Name', 'Motivations', 'Peak Size']]
carnegiemm_filtered2

Unnamed: 0,Year,Country,freedom,Protest Name,Motivations,Peak Size
0,2019,Albania,Partly free,Local elections protests,Electoral fraud and corruption.,">10,000"
1,2019,Algeria,Not free,Protests against “Le Pouvoir”,"Economic downturn, corruption and nepotism, an...",">1,000,000"
2,2018,Argentina,Free,Austerity protests,"Rising inflation and unemployment, as well as ...",">10,000"
3,2020,Argentina,Free,Violence against women protests,Violence against women and lack of resources/i...,Thousands
4,2018,Armenia,Partly free,“#RejectSerzh” protests,"Democratic backslide, poverty, and corruption.",">100,000"
...,...,...,...,...,...,...
123,2018,Uruguay,Free,Farmer protest,Lack of government support for farmers and inc...,">10,000"
124,2017,Venezuela,Not free,“Mother of All Marches” protests,"Growing authoritarianism, economic crisis, and...",200000
125,2019,Venezuela,Not free,January 2019 protests,"Economic collapse, corruption, food shortages,...",">100,000"
126,2018,Vietnam,Not free,Cybersecurity/special economic zone protests,Fear of growing Chinese influence through spec...,50000


## Harvard mass mobilization data exploration

**About the dataset**

Source: https://massmobilization.github.io

Protest is a dichotomous coding for whether or not there was a protest action in a particular period
The project defines a protest as a gathering of 50 or more people to make a demand of the government. A
protest action must be targeted at the state or state policy. The project does not code protests in one
country that are targeted at the policies of another country and in that sense, it captures only “home
grown” protest activities targeted at state policies.

We are interested in mobilization of anti-state protest, not necessarily community disputes between
various groups, and therefore we do not code inter-communal demonstrations. 

Protests that involve an industrial enterprise that is affected by state policy
such as labor rights or safety, does represent a codeable activity only if the people take to the streets to
demand better conditions, wages, safety, and the outcome is a function of state level policy decisions. A
union action targeted specifically at conditions linked to a specific company and are part of labor
negotiations do not constitute a protest, unless or until the labor action becomes a public event (by, say,
marching in the streets or demanding action on the part of the government).

In [20]:
# rows, columns
harvardmm_df.shape

(16363, 31)

In [21]:
harvardmm_df.head()

Unnamed: 0,id,country,ccode,year,region,protest,protestnumber,startday,startmonth,startyear,...,protesterdemand4,stateresponse1,stateresponse2,stateresponse3,stateresponse4,stateresponse5,stateresponse6,stateresponse7,sources,notes
0,201990001,Canada,20,1990,North America,1,1,15.0,1.0,1990.0,...,,ignore,,,,,,,1. Great Canadian train journeys into history;...,Canada s railway passenger system was finally...
1,201990002,Canada,20,1990,North America,1,2,25.0,6.0,1990.0,...,,ignore,,,,,,,1. Autonomy s Cry Revived in Quebec The New Yo...,protestors were only identified as young peop...
2,201990003,Canada,20,1990,North America,1,3,1.0,7.0,1990.0,...,,ignore,,,,,,,1. Quebec protest after Queen calls for unity ...,"THE Queen, after calling on Canadians to rema..."
3,201990004,Canada,20,1990,North America,1,4,12.0,7.0,1990.0,...,,accomodation,,,,,,,1. Indians Gather as Siege Intensifies; Armed ...,Canada s federal government has agreed to acq...
4,201990005,Canada,20,1990,North America,1,5,14.0,8.0,1990.0,...,,crowd dispersal,arrests,accomodation,,,,,1. Dozens hurt in Mohawk blockade protest The ...,Protests were directed against the state due t...


In [22]:
harvardmm_df.tail()

Unnamed: 0,id,country,ccode,year,region,protest,protestnumber,startday,startmonth,startyear,...,protesterdemand4,stateresponse1,stateresponse2,stateresponse3,stateresponse4,stateresponse5,stateresponse6,stateresponse7,sources,notes
16358,9102017001,Papua New Guinea,910,2017,Oceania,1,1,15.0,6.0,2017.0,...,,accomodation,,,,,,,Bougainville imposes moratorium on Panguna mi...,The Bougainville government has enacted an ind...
16359,9102017002,Papua New Guinea,910,2017,Oceania,1,2,15.0,7.0,2017.0,...,,crowd dispersal,,,,,,,"Violence, chaos and fraud: fraught Papua New ...",Peter O Neill has been reappointed as prime mi...
16360,9102017003,Papua New Guinea,910,2017,Oceania,1,3,31.0,10.0,2017.0,...,,ignore,,,,,,,Refugees dig in as camp closes; Manus Situa...,Refugees on Manus Island were braced for poten...
16361,9102018000,Papua New Guinea,910,2018,Oceania,0,0,,,,...,,,,,,,,,,
16362,9102019000,Papua New Guinea,910,2019,Oceania,0,0,,,,...,.,,,,,.,,,,


In [45]:
harvardmm_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16363 entries, 0 to 16362
Data columns (total 31 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     16363 non-null  int64  
 1   country                16363 non-null  object 
 2   ccode                  16363 non-null  int64  
 3   year                   16363 non-null  int64  
 4   region                 16363 non-null  object 
 5   protest                16363 non-null  int64  
 6   protestnumber          16363 non-null  int64  
 7   startday               14514 non-null  float64
 8   startmonth             14514 non-null  float64
 9   startyear              14514 non-null  float64
 10  endday                 14514 non-null  float64
 11  endmonth               14514 non-null  float64
 12  endyear                14514 non-null  float64
 13  protesterviolence      15033 non-null  float64
 14  location               14493 non-null  object 
 15  pa

In [46]:
# Harvard dataset includes some rows were protests are not deemed relevant. These are indicated with a 0 in the column 'protests'
# I drop these lines since they are irrelevant to the current analysis

harvardmm_filtered = harvardmm_df[harvardmm_df['protest'] !=0]

# Most interested in Year, Country, Protest Name(protesterdemand1), Motivations (protesterdemand2), Peak Size (participants)
harvardmm_filtered = harvardmm_filtered[['year', 'country', 'protesterdemand1', 'protesterdemand2', 'participants']]
harvardmm_filtered

Unnamed: 0,year,country,protesterdemand1,protesterdemand2,participants
0,1990,Canada,"political behavior, process",labor wage dispute,1000s
1,1990,Canada,"political behavior, process",,1000
2,1990,Canada,"political behavior, process",,500
3,1990,Canada,land farm issue,,100s
4,1990,Canada,"political behavior, process",,950
...,...,...,...,...,...
16355,2014,Papua New Guinea,"political behavior, process",,100+
16357,2016,Papua New Guinea,removal of politician,,About 1000
16358,2017,Papua New Guinea,"political behavior, process",land farm issue,50+
16359,2017,Papua New Guinea,"political behavior, process",,50+


In [47]:
# I want to add the freedom index from randomhouse to the harvard dataset too
# For this, replicate the years vertically and add the value per row and column downwards
freedom = freedom_df.copy()
freedom.rename(columns={'Year(s) Under Review':'country'}, inplace = True)
freedom.head()

Unnamed: 0,country,1990,1991,1992,1993,1994,1995,1996,1997,1998,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Afghanistan,NF,NF,NF,NF,NF,NF,NF,NF,NF,...,NF,NF,NF,NF,NF,NF,NF,NF,NF,NF
1,Albania,NF,PF,PF,PF,PF,PF,PF,PF,PF,...,PF,PF,PF,PF,PF,PF,PF,PF,PF,PF
2,Algeria,PF,PF,NF,NF,NF,NF,NF,NF,NF,...,NF,NF,NF,NF,NF,NF,NF,NF,NF,NF
3,Andorra,-,-,-,F,F,F,F,F,F,...,F,F,F,F,F,F,F,F,F,F
4,Angola,NF,PF,NF,NF,NF,NF,NF,NF,NF,...,NF,NF,NF,NF,NF,NF,NF,NF,NF,NF


In [48]:
# check that country returns the desired column
freedom['country']

0      Afghanistan
1          Albania
2          Algeria
3          Andorra
4           Angola
          ...     
200      Yemen, N.
201      Yemen, S.
202     Yugoslavia
203         Zambia
204       Zimbabwe
Name: country, Length: 205, dtype: object

In [49]:
# prepare list for melt
list_of_cols = list(freedom.columns)

In [50]:
# melt
pivoted = pd.melt(freedom, id_vars = ['country'], value_vars=list_of_cols[1:], var_name = 'year', value_name='freedom')
pivoted.head()

Unnamed: 0,country,year,freedom
0,Afghanistan,1990,NF
1,Albania,1990,NF
2,Algeria,1990,PF
3,Andorra,1990,-
4,Angola,1990,NF


In [51]:
pivoted.shape

(6150, 3)

In [52]:
# insert pivoted into harvardmm_filtered
harvard_merged = harvardmm_filtered.merge(pivoted, on=['country', 'year'], how = 'left')
harvard_merged

#Check: the df keeps the same number of rows as harvardmm_filtered

Unnamed: 0,year,country,protesterdemand1,protesterdemand2,participants,freedom
0,1990,Canada,"political behavior, process",labor wage dispute,1000s,F
1,1990,Canada,"political behavior, process",,1000,F
2,1990,Canada,"political behavior, process",,500,F
3,1990,Canada,land farm issue,,100s,F
4,1990,Canada,"political behavior, process",,950,F
...,...,...,...,...,...,...
14509,2014,Papua New Guinea,"political behavior, process",,100+,PF
14510,2016,Papua New Guinea,removal of politician,,About 1000,PF
14511,2017,Papua New Guinea,"political behavior, process",land farm issue,50+,PF
14512,2017,Papua New Guinea,"political behavior, process",,50+,PF


In [53]:
# change names so that they are the same as in carnegiemm_filtered2

harvard_merged['freedom'] = harvard_merged['freedom'].str.replace('NF','Not free')
harvard_merged['freedom'] = harvard_merged['freedom'].str.replace('PF','Partly free')
harvard_merged['freedom'] = harvard_merged['freedom'].str.replace('F','Free')
harvard_merged

Unnamed: 0,year,country,protesterdemand1,protesterdemand2,participants,freedom
0,1990,Canada,"political behavior, process",labor wage dispute,1000s,Free
1,1990,Canada,"political behavior, process",,1000,Free
2,1990,Canada,"political behavior, process",,500,Free
3,1990,Canada,land farm issue,,100s,Free
4,1990,Canada,"political behavior, process",,950,Free
...,...,...,...,...,...,...
14509,2014,Papua New Guinea,"political behavior, process",,100+,Partly free
14510,2016,Papua New Guinea,removal of politician,,About 1000,Partly free
14511,2017,Papua New Guinea,"political behavior, process",land farm issue,50+,Partly free
14512,2017,Papua New Guinea,"political behavior, process",,50+,Partly free


In [54]:
# rename columns
harvard_merged_rename=harvard_merged.rename(columns={"year": "Year", "country": "Country", "protesterdemand1": "Protest Name", "protesterdemand2": "Motivations", "participants":"Peak Size"})

In [55]:
#put the columns in the same order
harvard_merged_rename = harvard_merged_rename[['Year', 'Country', 'freedom', 'Protest Name', 'Motivations', 'Peak Size']]
harvard_merged_rename

Unnamed: 0,Year,Country,freedom,Protest Name,Motivations,Peak Size
0,1990,Canada,Free,"political behavior, process",labor wage dispute,1000s
1,1990,Canada,Free,"political behavior, process",,1000
2,1990,Canada,Free,"political behavior, process",,500
3,1990,Canada,Free,land farm issue,,100s
4,1990,Canada,Free,"political behavior, process",,950
...,...,...,...,...,...,...
14509,2014,Papua New Guinea,Partly free,"political behavior, process",,100+
14510,2016,Papua New Guinea,Partly free,removal of politician,,About 1000
14511,2017,Papua New Guinea,Partly free,"political behavior, process",land farm issue,50+
14512,2017,Papua New Guinea,Partly free,"political behavior, process",,50+


In [56]:
# drop 2017 data since it is more complete in the carnegie datasets
harvard_merged_rename = harvard_merged_rename[harvard_merged_rename.Year!=2017]
harvard_merged_rename

Unnamed: 0,Year,Country,freedom,Protest Name,Motivations,Peak Size
0,1990,Canada,Free,"political behavior, process",labor wage dispute,1000s
1,1990,Canada,Free,"political behavior, process",,1000
2,1990,Canada,Free,"political behavior, process",,500
3,1990,Canada,Free,land farm issue,,100s
4,1990,Canada,Free,"political behavior, process",,950
...,...,...,...,...,...,...
14506,2012,Papua New Guinea,Partly free,"political behavior, process",,1000
14507,2012,Papua New Guinea,Partly free,"political behavior, process",,1000s
14508,2013,Papua New Guinea,Partly free,"political behavior, process",,2000
14509,2014,Papua New Guinea,Partly free,"political behavior, process",,100+


## Mass mobilization dataset

I will incorporate only the data of protests with more than 1000 people attending at its peak moment.

In [57]:
# Concatenate both datasets. I want to keep carnegiemm_filtered2 unaltered and add the values of harvard_merged to the df
protests=pd.concat([carnegiemm_filtered2,harvard_merged_rename],axis=0)#top/bottom

In [58]:
# Rename columns to more callable names
protests= protests.rename(columns={"Protest Name": "Protest_Name", "Peak Size": "Peak_Size"})

In [59]:
protests

Unnamed: 0,Year,Country,freedom,Protest_Name,Motivations,Peak_Size
0,2019,Albania,Partly free,Local elections protests,Electoral fraud and corruption.,">10,000"
1,2019,Algeria,Not free,Protests against “Le Pouvoir”,"Economic downturn, corruption and nepotism, an...",">1,000,000"
2,2018,Argentina,Free,Austerity protests,"Rising inflation and unemployment, as well as ...",">10,000"
3,2020,Argentina,Free,Violence against women protests,Violence against women and lack of resources/i...,Thousands
4,2018,Armenia,Partly free,“#RejectSerzh” protests,"Democratic backslide, poverty, and corruption.",">100,000"
...,...,...,...,...,...,...
14506,2012,Papua New Guinea,Partly free,"political behavior, process",,1000
14507,2012,Papua New Guinea,Partly free,"political behavior, process",,1000s
14508,2013,Papua New Guinea,Partly free,"political behavior, process",,2000
14509,2014,Papua New Guinea,Partly free,"political behavior, process",,100+


In [38]:
# save dataframe
protests.to_csv('./Data/protests.csv', encoding='utf-8')

In [60]:
# Peak_Size column cleanup
protests.isnull().sum(axis = 0)

Year                0
Country             0
freedom           492
Protest_Name       11
Motivations     11410
Peak_Size          12
dtype: int64

In [61]:
protests.Peak_Size.fillna('', inplace=True)

In [62]:
protests.Peak_Size.unique()

array(['>10,000', '>1,000,000', 'Thousands', '>100,000', '30,000',
       '5,000', '15,000', '>1,000', '1,500,000', 'Hundreds', 'Unknown',
       '20,000', '250,000', '4000', '200,000', '300,000', '800,000',
       '10,000', '40,000', '18,000', '22,000', '2,000,000', '100,000',
       '2000', '1,000,000', '12,500', 'Tens of thousands', '4,000',
       '60,000', '50,000', '60000', '500,000', '2,000', '37,500',
       '65,000', '1.5 million', '600,000', '62,500', '12,000',
       '1 million', '>700', '1000s', '1000', '500', '100s', '950', '200',
       '110000', '10000s', '10000', '6000', '3000', '126000', '45000',
       '400', '10s', '250', '170000', '300', '12000', '1500', '50+',
       '250-300', 'hundreds', 'more than 200', 'dozens', 'a group',
       'about 300', 'more than 500', 'busloads', 'widespread',
       'thousands', '100', '50', '100+', '80', '2500', '150',
       '100s-1000s', '8000', '100s-1000', '300-1000s', '15000', '600',
       '1500+', 'about 2000', 'several hundred

In [63]:
import re

def number_cleaner(text):
    a = re.findall(r'[-><]\s*(\d+)', text)
    if a == []:
        b = re.findall(r'\d+', text)
        if b ==[]:
            return text
        else:
            return b[0]
    else:
        return a[0]

protests['Peak_Size'] = protests.Peak_Size.apply(number_cleaner)

In [64]:
# Analysis of Peak_Size column for cleaning

protests.Peak_Size.unique()

array(['10', '1', 'Thousands', '100', '30', '5', '15', 'Hundreds',
       'Unknown', '20', '250', '4000', '200', '300', '800', '40', '18',
       '22', '2', '2000', '12', 'Tens of thousands', '4', '60', '50',
       '60000', '500', '37', '65', '600', '62', '700', '1000', '950',
       '110000', '10000', '6000', '3000', '126000', '45000', '400',
       '170000', '12000', '1500', 'hundreds', 'dozens', 'a group',
       'busloads', 'widespread', 'thousands', '80', '2500', '150', '8000',
       '15000', 'several hundred', 'tens of thousands', '270', '350',
       '5000', '20000', '100000', '30000', '70000', '80000', '200000',
       '750000', '90000', '150000', '400000', '1000000', '50000',
       'several dozen', '180', '35000', '', '40000', '130', '23000',
       '14000', '900', '1200', '1700', '280', '2400', '90', 'hundreds ',
       'hundreds of thousands', '164', '7000', '25000', '240000', '1100',
       '4500', '75000', '250000', '500000', '300000', '120000', '13800',
       '600000'

In [72]:
protests.loc[protests['Peak_Size'] =='dozens','Peak_Size'] ='12'
protests.loc[protests['Peak_Size'] =='several dozen','Peak_Size'] ='12'
protests.loc[protests['Peak_Size'] =='a group','Peak_Size'] ='unknown'
protests.loc[protests['Peak_Size'] =='busloads','Peak_Size'] ='unknown'
protests.loc[protests['Peak_Size'] =='widespread','Peak_Size'] ='unknown'
protests.loc[protests['Peak_Size'] =='Hundreds','Peak_Size'] ='100'
protests.loc[protests['Peak_Size'] =='hundreds','Peak_Size'] ='100'
protests.loc[protests['Peak_Size'] =='hundreds ','Peak_Size'] ='100'
protests.loc[protests['Peak_Size'] =='several hundred','Peak_Size'] ='100'
protests.loc[protests['Peak_Size'] =='several thousand','Peak_Size'] ='1000'
protests.loc[protests['Peak_Size'] =='Thousands','Peak_Size'] ='1000'
protests.loc[protests['Peak_Size'] =='Tens of thousands','Peak_Size'] ='10000'
protests.loc[protests['Peak_Size'] =='tens of thousands','Peak_Size'] ='10000'
protests.loc[protests['Peak_Size'] =='hundreds of thousands','Peak_Size'] ='100000'
protests.loc[protests['Peak_Size'] =='several dozen arrests','Peak_Size'] ='unknown'

In [74]:
protests.loc[protests['Peak_Size'] =='a few dozen','Peak_Size'] ='12'
protests.loc[protests['Peak_Size'] =='thousands','Peak_Size'] ='1000'
protests.loc[protests['Peak_Size'] =='a few thousand','Peak_Size'] ='1000'
protests.loc[protests['Peak_Size'] =='Several thousand','Peak_Size'] ='1000'

protests.loc[protests['Peak_Size'] =='scores','Peak_Size'] ='unknown'
protests.loc[protests['Peak_Size'] =='more than a dozen schools','Peak_Size'] ='12'
protests.loc[protests['Peak_Size'] =='dozens arrested ','Peak_Size'] ='unknown'
protests.loc[protests['Peak_Size'] =='Thousands ','Peak_Size'] ='1000'

protests.loc[protests['Peak_Size'] =='very low thousands','Peak_Size'] ='1000'
protests.loc[protests['Peak_Size'] =='several hundreds','Peak_Size'] ='100'
protests.loc[protests['Peak_Size'] =='hundreds  ','Peak_Size'] ='100'
protests.loc[protests['Peak_Size'] =='hundreds of thousands ','Peak_Size'] ='100000'

In [78]:
protests.loc[protests['Peak_Size'] =='Hundreds of thousands','Peak_Size'] ='100000'
protests.loc[protests['Peak_Size'] =='Dozens','Peak_Size'] ='12'
protests.loc[protests['Peak_Size'] =='several dozens','Peak_Size'] ='12'
protests.loc[protests['Peak_Size'] =='millions','Peak_Size'] ='1000000'

protests.loc[protests['Peak_Size'] =='teachers and pupils','Peak_Size'] ='unknown'
protests.loc[protests['Peak_Size'] =='few thousand','Peak_Size'] ='1000'
protests.loc[protests['Peak_Size'] =='several hundred thousands','Peak_Size'] ='100000'
protests.loc[protests['Peak_Size'] =='a few thousand ','Peak_Size'] ='1000'

protests.loc[protests['Peak_Size'] =='couple thousands','Peak_Size'] ='2000'
protests.loc[protests['Peak_Size'] =='a few hundred','Peak_Size'] ='100'
protests.loc[protests['Peak_Size'] =='several thousand ','Peak_Size'] ='1000'
protests.loc[protests['Peak_Size'] =='dozens arrested','Peak_Size'] ='unknown'

In [80]:

protests.loc[protests['Peak_Size'] =='Hundreds of people','Peak_Size'] ='100'
protests.loc[protests['Peak_Size'] =='Hundreds of workers','Peak_Size'] ='100'
protests.loc[protests['Peak_Size'] =='Thousands of miners','Peak_Size'] ='1000'
protests.loc[protests['Peak_Size'] =='Several hundred demonstrators','Peak_Size'] ='100'


protests.loc[protests['Peak_Size'] =='Hundreds of villagers','Peak_Size'] ='100'
protests.loc[protests['Peak_Size'] =='Thousands of people','Peak_Size'] ='1000'
protests.loc[protests['Peak_Size'] =='A few hundred','Peak_Size'] ='100'
protests.loc[protests['Peak_Size'] =='Tens of thousands of people','Peak_Size'] ='100'

In [91]:
protests.loc[protests['Peak_Size'] =='few dozen','Peak_Size'] ='100'
protests.loc[protests['Peak_Size'] =='Unknown','Peak_Size'] ='unknown'

In [92]:
protests.Peak_Size.unique()

array(['10', '1', '1000', '100', '30', '5', '15', 'unknown', '20', '250',
       '4000', '200', '300', '800', '40', '18', '22', '2', '2000', '12',
       '10000', '4', '60', '50', '60000', '500', '37', '65', '600', '62',
       '700', '950', '110000', '6000', '3000', '126000', '45000', '400',
       '170000', '12000', '1500', '80', '2500', '150', '8000', '15000',
       '270', '350', '5000', '20000', '100000', '30000', '70000', '80000',
       '200000', '750000', '90000', '150000', '400000', '1000000',
       '50000', '180', '35000', '', '40000', '130', '23000', '14000',
       '900', '1200', '1700', '280', '2400', '90', '164', '7000', '25000',
       '240000', '1100', '4500', '75000', '250000', '500000', '300000',
       '120000', '13800', '600000', '800000', '900000', '700000',
       '350000', '450000', '53000', '280000', '72000', '270000',
       '1300000', '22000', '135000', '9000', '290000', '44000', '36000',
       '168000', '11000', '16', '75', '1400', '4800', '2450', '6500',
 

In [118]:
import re

string = '5000s'

def number_cleaner(text):
    return re.findall(r'([-><+]\s*(\d+))&((\d+)[+s])', text)

number_cleaner(string)

[]

In [86]:
protests.dtypes

Year            object
Country         object
freedom         object
Protest_Name    object
Motivations     object
Peak_Size       object
dtype: object

In [93]:
protests[~protests.Peak_Size.str.contains("unknown")]

Unnamed: 0,Year,Country,freedom,Protest_Name,Motivations,Peak_Size
0,2019,Albania,Partly free,Local elections protests,Electoral fraud and corruption.,10
1,2019,Algeria,Not free,Protests against “Le Pouvoir”,"Economic downturn, corruption and nepotism, an...",1
2,2018,Argentina,Free,Austerity protests,"Rising inflation and unemployment, as well as ...",10
3,2020,Argentina,Free,Violence against women protests,Violence against women and lack of resources/i...,1000
4,2018,Armenia,Partly free,“#RejectSerzh” protests,"Democratic backslide, poverty, and corruption.",100
...,...,...,...,...,...,...
14506,2012,Papua New Guinea,Partly free,"political behavior, process",,1000
14507,2012,Papua New Guinea,Partly free,"political behavior, process",,1000
14508,2013,Papua New Guinea,Partly free,"political behavior, process",,2000
14509,2014,Papua New Guinea,Partly free,"political behavior, process",,100


In [96]:
protests['Peak_Size'] = protests['Peak_Size'].astype(int)

ValueError: invalid literal for int() with base 10: 'unknown'

In [None]:
# issues:
# the data for the carnegie dataset (years 2017 to 2020) is in different bases
# this happens after applying the number cleaner
## inside the protests column 'Peak_Size' there's an 'unknown' value I do not manage to get rid of
### i cannot turn column into integers, maybe due to the prior points 