After downloading the big 15GB file which contains the whole data for Morteza's dataset, this notebook is trying to do the following:
1. Read `wave1_annotated.csv` file containing every annotated tweet from Morteza's dataset.
2. Open Morteza's raw complete abortion dataset file (mortezas_dataset_complete.csv)
3. Add an annotation column to every row.
4. Using id_str field, put the annotated label of the annotated rows (from the first file) in Morteza's dataset. And empty annotation column for the rest.
5. The result file should have the following columns: `full_text`, `conversation_id_str`, `created_at`, `id_str`, `user`, `annotation`
6. Exporting the result `mortezas_dataset_complete_with_annotations.csv` file
The resulting file should have around 3 million tweets.

# Load Morteza's complete dataset and preprocess it

* Keep only the related columns
* Change the `dtypes`
* Save it to the `mortezas_dataset_modified.csv` file

In [1]:
import pandas as pd
import os

In [2]:
# Get parent directory
parent_dir = os.path.dirname(os.getcwd())
mortezas_dataset_complete_path = os.path.join(parent_dir, 'raw', 'mortezas_dataset_complete.csv')
mortezas_dataset_complete_path

'/home/parsa/Codebases/GitHub_Repositories/USC-PSYC626-Abortion-Project/data/raw/mortezas_dataset_complete.csv'

In [20]:
mortezas_dataset_modified_path = os.path.join(parent_dir, 'raw', 'mortezas_dataset_modified.csv')
mortezas_dataset_modified_path

'/home/parsa/Codebases/GitHub_Repositories/USC-PSYC626-Abortion-Project/data/raw/mortezas_dataset_modified.csv'

In [3]:
wave1_annotated_path = os.path.join(parent_dir, 'raw', 'abortion_sentiment_prediction', 'wave1_annotated.csv')
wave1_annotated_path

'/home/parsa/Codebases/GitHub_Repositories/USC-PSYC626-Abortion-Project/data/raw/abortion_sentiment_prediction/wave1_annotated.csv'

In [4]:
mortezas_dataset_complete_df = pd.read_csv(mortezas_dataset_complete_path)
mortezas_dataset_complete_df.head()

  mortezas_dataset_complete_df = pd.read_csv(mortezas_dataset_complete_path)


Unnamed: 0,_id,contributors,conversation_id,conversation_id_str,coordinates,created_at,display_text_range,entities,favorite_count,favorited,...,quoted_status,quoted_status_id,quoted_status_id_str,quoted_status_permalink,self_thread,withheld_in_countries,scopes,withheld_scope,withheld_copyright,conversation_control
0,ObjectId(65287c8bb9dceaa49b99cea4),,550441533166456832,550441533166456832,,Thu Jan 01 00:01:02 +0000 2015,"[0,61]","{""hashtags"":[{""indices"":[6,16],""text"":""prochoi...",2,False,...,,,,,,,,,,
1,ObjectId(65287c8bb9dceaa49b99cea5),,550442364456558592,550442364456558592,,Thu Jan 01 00:04:21 +0000 2015,"[0,99]","{""hashtags"":[{""indices"":[52,61],""text"":""aborti...",0,False,...,,,,,,,,,,
2,ObjectId(65287c8bb9dceaa49b99cea6),,550443662664929281,550443662664929281,,Thu Jan 01 00:09:30 +0000 2015,"[0,113]","{""hashtags"":[{""indices"":[26,35],""text"":""aborti...",0,False,...,,,,,,,,,,
3,ObjectId(65287c8bb9dceaa49b99cea7),,550428829618286592,550428829618286592,,Thu Jan 01 00:10:50 +0000 2015,"[0,132]","{""hashtags"":[{""indices"":[26,44],""text"":""Planne...",0,False,...,,,,,,,,,,
4,ObjectId(65287c8bb9dceaa49b99cea8),,550444672221724672,550444672221724672,,Thu Jan 01 00:13:31 +0000 2015,"[0,139]","{""hashtags"":[{""indices"":[0,8],""text"":""ProLife""...",0,False,...,,,,,,,,,,


In [5]:
# Only keep the "full_text", "conversation_id", "conversation_id_str", "created_at", "id", "id_str", "user" columns
mortezas_dataset_complete_df = mortezas_dataset_complete_df[["full_text", "conversation_id_str", "created_at", "id_str", "user"]]
mortezas_dataset_complete_df.head()

Unnamed: 0,full_text,conversation_id_str,created_at,id_str,user
0,Happy #prochoice New Year. #HumanRights for wo...,550441533166456832,Thu Jan 01 00:01:02 +0000 2015,550441533166456832,"{""advertiser_account_service_levels"":[],""adver..."
1,Tubal (ectopic) pregnancies increase by 30% af...,550442364456558592,Thu Jan 01 00:04:21 +0000 2015,550442364456558592,"{""advertiser_account_service_levels"":[],""adver..."
2,Nicki Minaj discusses the #abortion she refere...,550443662664929281,Thu Jan 01 00:09:30 +0000 2015,550443662664929281,"{""advertiser_account_service_levels"":[],""adver..."
3,@pmbasse @WendyDavisTexas #PlannedParenthood r...,550428829618286592,Thu Jan 01 00:10:50 +0000 2015,550443998662647811,"{""advertiser_account_service_levels"":[""analyti..."
4,#ProLife #Perspective On This Story: @NickiMin...,550444672221724672,Thu Jan 01 00:13:31 +0000 2015,550444672221724672,"{""advertiser_account_service_levels"":[""smb""],""..."


In [6]:
# Make the full_text, id_str and conversation_id_str columns strings
mortezas_dataset_complete_df["full_text"] = mortezas_dataset_complete_df["full_text"].astype(str)
mortezas_dataset_complete_df["id_str"] = mortezas_dataset_complete_df["id_str"].astype(str)
mortezas_dataset_complete_df["conversation_id_str"] = mortezas_dataset_complete_df["conversation_id_str"].astype(str)

In [7]:
# Are conversation_id_str unique?
print('conversation_id_str is unique:', mortezas_dataset_complete_df["conversation_id_str"].is_unique)

conversation_id_str is unique: False


In [8]:
# Are id_str unique?
print('id_str is unique:', mortezas_dataset_complete_df["id_str"].is_unique)

id_str is unique: True


In [9]:
# Check for nulls in all columns
mortezas_dataset_complete_df.isnull().sum()

full_text              0
conversation_id_str    0
created_at             0
id_str                 0
user                   0
dtype: int64

In [16]:
mortezas_dataset_complete_df.dtypes

full_text              object
conversation_id_str    object
created_at             object
id_str                 object
user                   object
dtype: object

In [21]:
mortezas_dataset_complete_df.to_csv(mortezas_dataset_modified_path, index=False)

# Load the modified version of the Morteza's Dataset

In [22]:
mortezas_dataset_modified_df = pd.read_csv(mortezas_dataset_modified_path, engine='python', dtype={'full_text': 'str', 'conversation_id_str': 'str', 'created_at': 'object', 'id_str': 'str', 'user': 'object'})
mortezas_dataset_modified_df.head()

Unnamed: 0,full_text,conversation_id_str,created_at,id_str,user
0,Happy #prochoice New Year. #HumanRights for wo...,550441533166456832,Thu Jan 01 00:01:02 +0000 2015,550441533166456832,"{""advertiser_account_service_levels"":[],""adver..."
1,Tubal (ectopic) pregnancies increase by 30% af...,550442364456558592,Thu Jan 01 00:04:21 +0000 2015,550442364456558592,"{""advertiser_account_service_levels"":[],""adver..."
2,Nicki Minaj discusses the #abortion she refere...,550443662664929281,Thu Jan 01 00:09:30 +0000 2015,550443662664929281,"{""advertiser_account_service_levels"":[],""adver..."
3,@pmbasse @WendyDavisTexas #PlannedParenthood r...,550428829618286592,Thu Jan 01 00:10:50 +0000 2015,550443998662647811,"{""advertiser_account_service_levels"":[""analyti..."
4,#ProLife #Perspective On This Story: @NickiMin...,550444672221724672,Thu Jan 01 00:13:31 +0000 2015,550444672221724672,"{""advertiser_account_service_levels"":[""smb""],""..."


In [23]:
# Add a column to the mortezas_dataset_modified_df named "annotation" having empty strings
mortezas_dataset_modified_df["annotation"] = ""

# Load wave1_annotated file containing all of the annotated rows

In [27]:
wave1_annotated_df = pd.read_csv(wave1_annotated_path, engine='python', dtype={'full_text': 'str', 'annotation': 'str', 'id_str': 'str'})
wave1_annotated_df

Unnamed: 0,full_text,annotation,id_str
0,@AdamParkhomenko #VoteBlueToEndThisNightmare #...,Choice,1278167104257011713
1,I was IN @NOW VP; I am complicit too--I'm not ...,Choice,1275697176879009799
2,@multialannaxo So true my mom did take Abortio...,Choice,1287868884200984576
3,@kdmport @TexasTribune This üëÜüèΩ is how we do it...,Choice,1242306252883066881
4,@elisa1121 Why does anyone think they have the...,Choice,1307448792032124934
...,...,...,...
2243,Ganz viel Solidarit√§t f√ºr #KristinaH√§nel nach...,Throw out,934061192162750464
2244,Planned Parenthood's (#repealthe8th donors) #p...,Choice,864891323664519170
2245,"""If you cease to care about Black lives after ...",Choice,919387018001616896
2246,Bericht einer schwangeren Person - √ºber ein (v...,Throw out,828578603386429440


In [28]:
wave1_annotated_df["annotation"].value_counts()

Life          920
Choice        852
Neutral       278
Throw out     111
Life           87
Name: annotation, dtype: int64

# Checks before merging two files

In [26]:
# All of the rows in wave1_annotated_df came from mortezas_dataset_modified_df so let's check if the id_strs in wave1_annotated_df are in mortezas_dataset_modified_df
wave1_annotated_df["id_str"].isin(mortezas_dataset_modified_df["id_str"]).value_counts()

True    2248
Name: id_str, dtype: int64

# Merge two files

In [34]:
# All of the id_strs in wave1_annotated_df are in mortezas_dataset_modified_df so let's merge the two dataframes
merged_df = pd.merge(mortezas_dataset_modified_df, wave1_annotated_df, on="id_str", how="left", suffixes=('_mortezas', '_wave1'))
merged_df

Unnamed: 0,full_text_mortezas,conversation_id_str,created_at,id_str,user,annotation_mortezas,full_text_wave1,annotation_wave1
0,Happy #prochoice New Year. #HumanRights for wo...,550441533166456832,Thu Jan 01 00:01:02 +0000 2015,550441533166456832,"{""advertiser_account_service_levels"":[],""adver...",,,
1,Tubal (ectopic) pregnancies increase by 30% af...,550442364456558592,Thu Jan 01 00:04:21 +0000 2015,550442364456558592,"{""advertiser_account_service_levels"":[],""adver...",,,
2,Nicki Minaj discusses the #abortion she refere...,550443662664929281,Thu Jan 01 00:09:30 +0000 2015,550443662664929281,"{""advertiser_account_service_levels"":[],""adver...",,,
3,@pmbasse @WendyDavisTexas #PlannedParenthood r...,550428829618286592,Thu Jan 01 00:10:50 +0000 2015,550443998662647811,"{""advertiser_account_service_levels"":[""analyti...",,,
4,#ProLife #Perspective On This Story: @NickiMin...,550444672221724672,Thu Jan 01 00:13:31 +0000 2015,550444672221724672,"{""advertiser_account_service_levels"":[""smb""],""...",,,
...,...,...,...,...,...,...,...,...
2693004,@maadinochka @ExmuslimsOrg I'm totally OK with...,1313556155281616898,Tue Oct 06 23:58:27 +0000 2020,1313629726515359746,"{""advertiser_account_service_levels"":[],""adver...",,,
2693005,This is what a so-called #prolife administrati...,1313629761063641088,Tue Oct 06 23:58:35 +0000 2020,1313629761063641088,"{""advertiser_account_service_levels"":[""analyti...",,,
2693006,@DonaldTrump\nHow's your pro-life base feel ab...,1313629892307812353,Tue Oct 06 23:59:07 +0000 2020,1313629892307812353,"{""advertiser_account_service_levels"":[""analyti...",,,
2693007,Similar to Biden‚Äôs promise of federal legislat...,1313629896900571137,Tue Oct 06 23:59:08 +0000 2020,1313629896900571137,"{""advertiser_account_service_levels"":[""media_s...",,,


In [35]:
merged_df.columns

Index(['full_text_mortezas', 'conversation_id_str', 'created_at', 'id_str',
       'user', 'annotation_mortezas', 'full_text_wave1', 'annotation_wave1'],
      dtype='object')

In [36]:
# Drop "full_text_wave1", "annotation_mortezas"
merged_df = merged_df.drop(columns=["full_text_wave1", "annotation_mortezas"])

# Rename "full_text_mortezas" to "full_text" and "annotation_wave1" to "annotation"
merged_df = merged_df.rename(columns={"full_text_mortezas": "full_text", "annotation_wave1": "annotation"})

merged_df

Unnamed: 0,full_text,conversation_id_str,created_at,id_str,user,annotation
0,Happy #prochoice New Year. #HumanRights for wo...,550441533166456832,Thu Jan 01 00:01:02 +0000 2015,550441533166456832,"{""advertiser_account_service_levels"":[],""adver...",
1,Tubal (ectopic) pregnancies increase by 30% af...,550442364456558592,Thu Jan 01 00:04:21 +0000 2015,550442364456558592,"{""advertiser_account_service_levels"":[],""adver...",
2,Nicki Minaj discusses the #abortion she refere...,550443662664929281,Thu Jan 01 00:09:30 +0000 2015,550443662664929281,"{""advertiser_account_service_levels"":[],""adver...",
3,@pmbasse @WendyDavisTexas #PlannedParenthood r...,550428829618286592,Thu Jan 01 00:10:50 +0000 2015,550443998662647811,"{""advertiser_account_service_levels"":[""analyti...",
4,#ProLife #Perspective On This Story: @NickiMin...,550444672221724672,Thu Jan 01 00:13:31 +0000 2015,550444672221724672,"{""advertiser_account_service_levels"":[""smb""],""...",
...,...,...,...,...,...,...
2693004,@maadinochka @ExmuslimsOrg I'm totally OK with...,1313556155281616898,Tue Oct 06 23:58:27 +0000 2020,1313629726515359746,"{""advertiser_account_service_levels"":[],""adver...",
2693005,This is what a so-called #prolife administrati...,1313629761063641088,Tue Oct 06 23:58:35 +0000 2020,1313629761063641088,"{""advertiser_account_service_levels"":[""analyti...",
2693006,@DonaldTrump\nHow's your pro-life base feel ab...,1313629892307812353,Tue Oct 06 23:59:07 +0000 2020,1313629892307812353,"{""advertiser_account_service_levels"":[""analyti...",
2693007,Similar to Biden‚Äôs promise of federal legislat...,1313629896900571137,Tue Oct 06 23:59:08 +0000 2020,1313629896900571137,"{""advertiser_account_service_levels"":[""media_s...",


In [37]:
merged_df["annotation"].value_counts()

Life          920
Choice        852
Neutral       278
Throw out     111
Life           87
Name: annotation, dtype: int64

In [39]:
# Make sure "full_text" "conversation_id_std", "id_str", "annotation" are strings
merged_df["full_text"] = merged_df["full_text"].astype(str)
merged_df["conversation_id_str"] = merged_df["conversation_id_str"].astype(str)
merged_df["id_str"] = merged_df["id_str"].astype(str)
merged_df["annotation"] = merged_df["annotation"].astype(str)

merged_df.dtypes

full_text              object
conversation_id_str    object
created_at             object
id_str                 object
user                   object
annotation             object
dtype: object

In [41]:
# Reset the index
merged_df = merged_df.reset_index(drop=True)
merged_df

Unnamed: 0,full_text,conversation_id_str,created_at,id_str,user,annotation
0,Happy #prochoice New Year. #HumanRights for wo...,550441533166456832,Thu Jan 01 00:01:02 +0000 2015,550441533166456832,"{""advertiser_account_service_levels"":[],""adver...",
1,Tubal (ectopic) pregnancies increase by 30% af...,550442364456558592,Thu Jan 01 00:04:21 +0000 2015,550442364456558592,"{""advertiser_account_service_levels"":[],""adver...",
2,Nicki Minaj discusses the #abortion she refere...,550443662664929281,Thu Jan 01 00:09:30 +0000 2015,550443662664929281,"{""advertiser_account_service_levels"":[],""adver...",
3,@pmbasse @WendyDavisTexas #PlannedParenthood r...,550428829618286592,Thu Jan 01 00:10:50 +0000 2015,550443998662647811,"{""advertiser_account_service_levels"":[""analyti...",
4,#ProLife #Perspective On This Story: @NickiMin...,550444672221724672,Thu Jan 01 00:13:31 +0000 2015,550444672221724672,"{""advertiser_account_service_levels"":[""smb""],""...",
...,...,...,...,...,...,...
2693004,@maadinochka @ExmuslimsOrg I'm totally OK with...,1313556155281616898,Tue Oct 06 23:58:27 +0000 2020,1313629726515359746,"{""advertiser_account_service_levels"":[],""adver...",
2693005,This is what a so-called #prolife administrati...,1313629761063641088,Tue Oct 06 23:58:35 +0000 2020,1313629761063641088,"{""advertiser_account_service_levels"":[""analyti...",
2693006,@DonaldTrump\nHow's your pro-life base feel ab...,1313629892307812353,Tue Oct 06 23:59:07 +0000 2020,1313629892307812353,"{""advertiser_account_service_levels"":[""analyti...",
2693007,Similar to Biden‚Äôs promise of federal legislat...,1313629896900571137,Tue Oct 06 23:59:08 +0000 2020,1313629896900571137,"{""advertiser_account_service_levels"":[""media_s...",


In [42]:
merged_df.to_csv(os.path.join(parent_dir, 'raw', 'mortezas_dataset_complete_with_annotations.csv'), index=False)