After downloading the before_roe and after_roe csv files, this notebook is trying to do the following:
1. Read `wave2_annotated.csv` file containing every annotated tweet from ISI's dataset.
2. Open ISI's raw abortion dataset file (`isi_dataset_sampled.csv`)
3. Add an annotation column to every row.
4. Using id_str field, put the annotated label of the annotated rows (from the first file) in ISI's dataset. And empty annotation column for the rest.
5. The result file should have the following columns: `full_text`, `conversation_id_str`, `created_at`, `id_str`, `user`, `annotation`
6. Exporting the result `isi_dataset_sampled_with_annotations.csv` file

# Load ISI's sampled dataset and preprocess it

* Keep only the related columns
* Change the `dtypes`
* Save it to the `isi_dataset_sampled_modified.csv` file

In [1]:
import pandas as pd
import os

In [2]:
# Get parent directory
parent_dir = os.path.dirname(os.getcwd())

In [3]:
isi_dataset_sampled_path = os.path.join(parent_dir, 'raw', 'isi_dataset_sampled.csv')
isi_dataset_sampled_path

'/home/parsa/Codebases/GitHub_Repositories/USC-PSYC626-Abortion-Project/data/raw/isi_dataset_sampled.csv'

In [5]:
isi_dataset_sampled_modified_path = os.path.join(parent_dir, 'raw', 'isi_dataset_sampled_modified.csv')
isi_dataset_sampled_modified_path

'/home/parsa/Codebases/GitHub_Repositories/USC-PSYC626-Abortion-Project/data/raw/isi_dataset_sampled_modified.csv'

In [6]:
wave2_annotated_path = os.path.join(parent_dir, 'raw', 'abortion_sentiment_prediction', 'wave2_annotated.csv')
wave2_annotated_path

'/home/parsa/Codebases/GitHub_Repositories/USC-PSYC626-Abortion-Project/data/raw/abortion_sentiment_prediction/wave2_annotated.csv'

In [7]:
isi_dataset_sampled_df = pd.read_csv(isi_dataset_sampled_path)
isi_dataset_sampled_df.head()

  isi_dataset_sampled_df = pd.read_csv(isi_dataset_sampled_path)


Unnamed: 0,id,conversation_id,referenced_tweets.replied_to.id,referenced_tweets.retweeted.id,referenced_tweets.quoted.id,author_id,in_reply_to_user_id,retweeted_user_id,quoted_user_id,created_at,...,geo.geo.type,geo.id,geo.name,geo.place_id,geo.place_type,edit_history_tweet_ids,edit_controls.edits_remaining,edit_controls.editable_until,edit_controls.is_edit_eligible,year_month
0,1486109037959786497,1.486109e+18,,1.486018e+18,,1.363876e+18,,1.34606e+18,,2022-01-25 22:49:39+00:00,...,,,,,,['1486109037959786497'],5.0,2022-01-25T23:19:39.000Z,True,2022-01
1,1487572155407781888,1.487572e+18,,1.487418e+18,,1.296883e+18,,2963623000.0,,2022-01-29 23:43:34+00:00,...,,,,,,['1487572155407781889'],5.0,2022-01-30T00:13:34.000Z,True,2022-01
2,1484722644368855041,1.484723e+18,,1.484626e+18,,345237300.0,,27493880.0,,2022-01-22 03:00:37+00:00,...,,,,,,['1484722644368855041'],5.0,2022-01-22T03:30:37.000Z,True,2022-01
3,1486948918034243584,1.486949e+18,,,,1.411539e+18,,,,2022-01-28 06:27:02+00:00,...,,,,,,['1486948918034243585'],5.0,2022-01-28T06:57:02.000Z,True,2022-01
4,1484536666266218496,1.484537e+18,1.484537e+18,,,19414560.0,19414559.0,,,2022-01-21 14:41:37+00:00,...,,,,,,['1484536666266218498'],5.0,2022-01-21T15:11:37.000Z,True,2022-01


In [13]:
# Only keep the "full_text", "conversation_id", "conversation_id_str", "created_at", "id", "id_str", "user" columns
isi_dataset_sampled_df = isi_dataset_sampled_df[["text", "conversation_id", "created_at", "id", "author.id"]]
isi_dataset_sampled_df.head()

Unnamed: 0,text,conversation_id,created_at,id,author.id
0,Meet our new advisory group!\nThese exceptiona...,1.486109e+18,2022-01-25 22:49:39+00:00,1486109037959786497,1.363876e+18
1,@ashtonpittman Republicans don't get to weigh ...,1.487572e+18,2022-01-29 23:43:34+00:00,1487572155407781888,1.296883e+18
2,BREAKING: A new poll shows 69% of Americans OP...,1.484723e+18,2022-01-22 03:00:37+00:00,1484722644368855041,345237300.0
3,what value is there to the life I want to end?,1.486949e+18,2022-01-28 06:27:02+00:00,1486948918034243584,1.411539e+18
4,The other side enjoys government support in Sc...,1.484537e+18,2022-01-21 14:41:37+00:00,1484536666266218496,19414560.0


In [14]:
# Rename "text" to "full_text"
isi_dataset_sampled_df = isi_dataset_sampled_df.rename(columns={"text": "full_text", "author.id": "user_id"})
isi_dataset_sampled_df.head()

Unnamed: 0,full_text,conversation_id,created_at,id,user_id
0,Meet our new advisory group!\nThese exceptiona...,1.486109e+18,2022-01-25 22:49:39+00:00,1486109037959786497,1.363876e+18
1,@ashtonpittman Republicans don't get to weigh ...,1.487572e+18,2022-01-29 23:43:34+00:00,1487572155407781888,1.296883e+18
2,BREAKING: A new poll shows 69% of Americans OP...,1.484723e+18,2022-01-22 03:00:37+00:00,1484722644368855041,345237300.0
3,what value is there to the life I want to end?,1.486949e+18,2022-01-28 06:27:02+00:00,1486948918034243584,1.411539e+18
4,The other side enjoys government support in Sc...,1.484537e+18,2022-01-21 14:41:37+00:00,1484536666266218496,19414560.0


In [30]:
print(isi_dataset_sampled_df[['id']].isnull().any(axis=1).sum())
print(isi_dataset_sampled_df[['conversation_id']].isnull().any(axis=1).sum())
print(isi_dataset_sampled_df[['user_id']].isnull().any(axis=1).sum())
print(isi_dataset_sampled_df[['conversation_id', 'user_id']].isnull().all(axis=1).sum())

0
1134
1134
1134


In [31]:
# Drop rows having null values in both "conversation_id" and "user_id" columns
isi_dataset_sampled_df = isi_dataset_sampled_df.dropna(subset=['conversation_id', 'user_id'], how='all')
isi_dataset_sampled_df.head()

Unnamed: 0,full_text,conversation_id,created_at,id,user_id
0,Meet our new advisory group!\nThese exceptiona...,1.486109e+18,2022-01-25 22:49:39+00:00,1486109037959786497,1.363876e+18
1,@ashtonpittman Republicans don't get to weigh ...,1.487572e+18,2022-01-29 23:43:34+00:00,1487572155407781888,1.296883e+18
2,BREAKING: A new poll shows 69% of Americans OP...,1.484723e+18,2022-01-22 03:00:37+00:00,1484722644368855041,345237300.0
3,what value is there to the life I want to end?,1.486949e+18,2022-01-28 06:27:02+00:00,1486948918034243584,1.411539e+18
4,The other side enjoys government support in Sc...,1.484537e+18,2022-01-21 14:41:37+00:00,1484536666266218496,19414560.0


In [32]:
print(isi_dataset_sampled_df[['id']].isnull().any(axis=1).sum())
print(isi_dataset_sampled_df[['conversation_id']].isnull().any(axis=1).sum())
print(isi_dataset_sampled_df[['user_id']].isnull().any(axis=1).sum())
print(isi_dataset_sampled_df[['conversation_id', 'user_id']].isnull().all(axis=1).sum())

0
0
0
0


In [51]:
def unify_id_types(row):
        if isinstance(row, str):
            if 'e' in row:
                print(row)
            else:
                return str(row)
        elif isinstance(row, int):
            return str(row)
    
isi_dataset_sampled_df["id"] = isi_dataset_sampled_df["id"].apply(unify_id_types)

In [52]:
# Convert the "conversation_id" and user_id and id to string
isi_dataset_sampled_df["conversation_id"] = isi_dataset_sampled_df["conversation_id"].astype(int).astype(str)
isi_dataset_sampled_df["user_id"] = isi_dataset_sampled_df["user_id"].astype(int).astype(str)

# Rename "id" to "id_str", "conversation_id" to "conversation_id_str" and "user_id" to "user_id_str"
isi_dataset_sampled_df = isi_dataset_sampled_df.rename(columns={"id": "id_str", "conversation_id": "conversation_id_str", "user_id": "user_id_str"})
isi_dataset_sampled_df.head()

Unnamed: 0,full_text,conversation_id_str,created_at,id_str,user_id_str
0,Meet our new advisory group!\nThese exceptiona...,1486109037959786496,2022-01-25 22:49:39+00:00,1486109037959786497,1363876242924199936
1,@ashtonpittman Republicans don't get to weigh ...,1487572155407781888,2022-01-29 23:43:34+00:00,1487572155407781888,1296882535230775296
2,BREAKING: A new poll shows 69% of Americans OP...,1484722644368855040,2022-01-22 03:00:37+00:00,1484722644368855041,345237343
3,what value is there to the life I want to end?,1486948918034243584,2022-01-28 06:27:02+00:00,1486948918034243584,1411539200353112064
4,The other side enjoys government support in Sc...,1484536663695151104,2022-01-21 14:41:37+00:00,1484536666266218496,19414559


In [53]:
# Make the full_text, id_str and conversation_id_str columns strings
isi_dataset_sampled_df["full_text"] = isi_dataset_sampled_df["full_text"].astype(str)
isi_dataset_sampled_df["id_str"] = isi_dataset_sampled_df["id_str"].astype(str)
isi_dataset_sampled_df["conversation_id_str"] = isi_dataset_sampled_df["conversation_id_str"].astype(str)

In [54]:
# Are conversation_id_str unique?
print('conversation_id_str is unique:', isi_dataset_sampled_df["conversation_id_str"].is_unique)

conversation_id_str is unique: False


In [55]:
# Are id_str unique?
print('id_str is unique:', isi_dataset_sampled_df["id_str"].is_unique)

id_str is unique: True


In [59]:
# Are user_id_str unique?
print('user_id_str is unique:', isi_dataset_sampled_df["user_id_str"].is_unique)

user_id_str is unique: False


In [60]:
# Check for nulls in all columns
isi_dataset_sampled_df.isnull().sum()

full_text              0
conversation_id_str    0
created_at             0
id_str                 0
user_id_str            0
dtype: int64

In [57]:
isi_dataset_sampled_df.dtypes

full_text              object
conversation_id_str    object
created_at             object
id_str                 object
user_id_str            object
dtype: object

In [58]:
isi_dataset_sampled_df.to_csv(isi_dataset_sampled_modified_path, index=False)

# Load the modified version of the ISI's Dataset

In [62]:
isi_dataset_sampled_modified_df = pd.read_csv(isi_dataset_sampled_modified_path, engine='python', dtype={'full_text': 'str', 'conversation_id_str': 'str', 'created_at': 'object', 'id_str': 'str', 'user_id_str': 'str'})
isi_dataset_sampled_modified_df.head()

Unnamed: 0,full_text,conversation_id_str,created_at,id_str,user_id_str
0,Meet our new advisory group!\nThese exceptiona...,1486109037959786496,2022-01-25 22:49:39+00:00,1486109037959786497,1363876242924199936
1,@ashtonpittman Republicans don't get to weigh ...,1487572155407781888,2022-01-29 23:43:34+00:00,1487572155407781888,1296882535230775296
2,BREAKING: A new poll shows 69% of Americans OP...,1484722644368855040,2022-01-22 03:00:37+00:00,1484722644368855041,345237343
3,what value is there to the life I want to end?,1486948918034243584,2022-01-28 06:27:02+00:00,1486948918034243584,1411539200353112064
4,The other side enjoys government support in Sc...,1484536663695151104,2022-01-21 14:41:37+00:00,1484536666266218496,19414559


In [63]:
# Add a column to the mortezas_dataset_modified_df named "annotation" having empty strings
isi_dataset_sampled_modified_df["annotation"] = ""

# Load wave2_annotated file containing all of the annotated rows

In [64]:
wave2_annotated_df = pd.read_csv(wave2_annotated_path, engine='python', dtype={'full_text': 'str', 'annotation': 'str', 'id_str': 'str'})
wave2_annotated_df

Unnamed: 0,id_str,full_text,annotation
0,1550605036069298176,@LINYMurph @MrColionNoir I just wonder what wa...,Choice
1,1593281365491020032,Y‚Äôall care more distractions than the real iss...,Neutral
2,1562160346181296128,"@FortBendCounty Not so much inclusive, safe ab...",Choice
3,1554997349327536128,@jbarro Indeed: a much better strategy for pro...,Choice
4,1596126550357073920,@IrishAlexis00 Yeah sure you could have prior ...,Neutral
...,...,...,...
3384,1521948198754938880,"So if that life matters, why aren't you in sup...",Choice
3385,1512589588350611712,@bgclk \nWHAT QUALITY OF LIFE MATTERS TO YOU h...,Throw out
3386,1519892963052204032,"Now that, according to some acidemics, men can...",Neutral
3387,1513641546968567809,"Since I value life here on Earth, EXTREMELY üåäüíß...",Throw out


In [65]:
wave2_annotated_df["annotation"].value_counts()

Choice        1748
Life           557
Neutral        521
Throw out      474
Life            64
Throw Out       13
Unsure          12
Name: annotation, dtype: int64

# Checks before merging two files

In [66]:
# All of the rows in wave2_annotated_df came from isi_dataset_sampled_modified_df so let's check if the id_strs in wave2_annotated_df are in isi_dataset_sampled_modified_df
wave2_annotated_df["id_str"].isin(isi_dataset_sampled_modified_df["id_str"]).value_counts()

False    3011
True      378
Name: id_str, dtype: int64

# Merge two files

In [67]:
# All of the id_strs in wave1_annotated_df are in mortezas_dataset_modified_df so let's merge the two dataframes
merged_df = pd.merge(isi_dataset_sampled_modified_df, wave2_annotated_df, on="id_str", how="left", suffixes=('_isi', '_wave2'))
merged_df

Unnamed: 0,full_text_isi,conversation_id_str,created_at,id_str,user_id_str,annotation_isi,full_text_wave2,annotation_wave2
0,Meet our new advisory group!\nThese exceptiona...,1486109037959786496,2022-01-25 22:49:39+00:00,1486109037959786497,1363876242924199936,,,
1,@ashtonpittman Republicans don't get to weigh ...,1487572155407781888,2022-01-29 23:43:34+00:00,1487572155407781888,1296882535230775296,,,
2,BREAKING: A new poll shows 69% of Americans OP...,1484722644368855040,2022-01-22 03:00:37+00:00,1484722644368855041,345237343,,,
3,what value is there to the life I want to end?,1486948918034243584,2022-01-28 06:27:02+00:00,1486948918034243584,1411539200353112064,,,
4,The other side enjoys government support in Sc...,1484536663695151104,2022-01-21 14:41:37+00:00,1484536666266218496,19414559,,,
...,...,...,...,...,...,...,...,...
8283219,@therealityp @jefftheman058 @KurtSchlichter Ok...,1609588321851965440,2023-01-02 18:52:59+00:00,1609986129318625280,16161976,,,
8283220,@NFT_GOD @Daggo_ Loved this ‚ù§Ô∏è,1609913211930697728,2023-01-03 10:33:27+00:00,1610222805337112576,1522766332693000192,,,
8283221,What matters most in life are the beautiful me...,1609562962360172544,2023-01-01 14:51:29+00:00,1609562962360172544,593269960,,,
8283222,@desontour_paul @ValaAfshar I don‚Äôt need you i...,1609008817383133184,2023-01-01 03:37:00+00:00,1609393226342965248,308134564,,,


In [68]:
merged_df.columns

Index(['full_text_isi', 'conversation_id_str', 'created_at', 'id_str',
       'user_id_str', 'annotation_isi', 'full_text_wave2', 'annotation_wave2'],
      dtype='object')

In [69]:
# Drop "full_text_wave2", "annotation_isi"
merged_df = merged_df.drop(columns=["full_text_wave2", "annotation_isi"])

# Rename "full_text_isi" to "full_text" and "annotation_wave2" to "annotation"
merged_df = merged_df.rename(columns={"full_text_isi": "full_text", "annotation_wave2": "annotation"})

merged_df

Unnamed: 0,full_text,conversation_id_str,created_at,id_str,user_id_str,annotation
0,Meet our new advisory group!\nThese exceptiona...,1486109037959786496,2022-01-25 22:49:39+00:00,1486109037959786497,1363876242924199936,
1,@ashtonpittman Republicans don't get to weigh ...,1487572155407781888,2022-01-29 23:43:34+00:00,1487572155407781888,1296882535230775296,
2,BREAKING: A new poll shows 69% of Americans OP...,1484722644368855040,2022-01-22 03:00:37+00:00,1484722644368855041,345237343,
3,what value is there to the life I want to end?,1486948918034243584,2022-01-28 06:27:02+00:00,1486948918034243584,1411539200353112064,
4,The other side enjoys government support in Sc...,1484536663695151104,2022-01-21 14:41:37+00:00,1484536666266218496,19414559,
...,...,...,...,...,...,...
8283219,@therealityp @jefftheman058 @KurtSchlichter Ok...,1609588321851965440,2023-01-02 18:52:59+00:00,1609986129318625280,16161976,
8283220,@NFT_GOD @Daggo_ Loved this ‚ù§Ô∏è,1609913211930697728,2023-01-03 10:33:27+00:00,1610222805337112576,1522766332693000192,
8283221,What matters most in life are the beautiful me...,1609562962360172544,2023-01-01 14:51:29+00:00,1609562962360172544,593269960,
8283222,@desontour_paul @ValaAfshar I don‚Äôt need you i...,1609008817383133184,2023-01-01 03:37:00+00:00,1609393226342965248,308134564,


In [70]:
merged_df["annotation"].value_counts()

Choice        176
Throw out     104
Life           40
Neutral        40
Life           15
Unsure          2
Throw Out       1
Name: annotation, dtype: int64

In [71]:
# Make sure "full_text" "conversation_id_std", "id_str", "annotation" are strings
merged_df["full_text"] = merged_df["full_text"].astype(str)
merged_df["conversation_id_str"] = merged_df["conversation_id_str"].astype(str)
merged_df["id_str"] = merged_df["id_str"].astype(str)
merged_df["annotation"] = merged_df["annotation"].astype(str)

merged_df.dtypes

full_text              object
conversation_id_str    object
created_at             object
id_str                 object
user_id_str            object
annotation             object
dtype: object

In [72]:
# Reset the index
merged_df = merged_df.reset_index(drop=True)
merged_df

Unnamed: 0,full_text,conversation_id_str,created_at,id_str,user_id_str,annotation
0,Meet our new advisory group!\nThese exceptiona...,1486109037959786496,2022-01-25 22:49:39+00:00,1486109037959786497,1363876242924199936,
1,@ashtonpittman Republicans don't get to weigh ...,1487572155407781888,2022-01-29 23:43:34+00:00,1487572155407781888,1296882535230775296,
2,BREAKING: A new poll shows 69% of Americans OP...,1484722644368855040,2022-01-22 03:00:37+00:00,1484722644368855041,345237343,
3,what value is there to the life I want to end?,1486948918034243584,2022-01-28 06:27:02+00:00,1486948918034243584,1411539200353112064,
4,The other side enjoys government support in Sc...,1484536663695151104,2022-01-21 14:41:37+00:00,1484536666266218496,19414559,
...,...,...,...,...,...,...
8283219,@therealityp @jefftheman058 @KurtSchlichter Ok...,1609588321851965440,2023-01-02 18:52:59+00:00,1609986129318625280,16161976,
8283220,@NFT_GOD @Daggo_ Loved this ‚ù§Ô∏è,1609913211930697728,2023-01-03 10:33:27+00:00,1610222805337112576,1522766332693000192,
8283221,What matters most in life are the beautiful me...,1609562962360172544,2023-01-01 14:51:29+00:00,1609562962360172544,593269960,
8283222,@desontour_paul @ValaAfshar I don‚Äôt need you i...,1609008817383133184,2023-01-01 03:37:00+00:00,1609393226342965248,308134564,


In [73]:
merged_df.to_csv(os.path.join(parent_dir, 'raw', 'isi_dataset_sampled_with_annotations.csv'), index=False)