In [22]:
# Regular expressions are used by the word-count helper further down.
import re
import pandas as pd
# Never truncate cell contents when displaying frames, so full argument texts stay readable.
pd.set_option('display.max_colwidth', None)
from tqdm import tqdm
# Register tqdm with pandas; this is what provides `progress_apply` on Series/DataFrames.
tqdm.pandas()

In [3]:
# Read the conservative subset of the appropriateness corpus.
# The path is relative to the notebook's working directory.
df = pd.read_csv("../data/appropriateness-corpus/appropriateness_corpus_conservative.csv")

In [4]:
# Non-null count per column -- a quick check for missing values.
df.count()

post_id                    2191
source_dataset             2191
issue                      2191
post_text                  2191
Inappropriateness          2191
Toxic Emotions             2191
Excessive Intensity        2191
Emotional Deception        2191
Missing Commitment         2191
Missing Seriousness        2191
Missing Openness           2191
Missing Intelligibility    2191
Unclear Meaning            2191
Missing Relevance          2191
Confusing Reasoning        2191
Other Reasons              2191
Detrimental Orthography    2191
Reason Unclassified        2191
dtype: int64

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
# Non-numeric columns are left out by default; in the recorded output that includes
# the text columns and the boolean 'Missing Seriousness' / 'Missing Openness' flags.
df.describe()

Unnamed: 0,post_id,source_dataset,Inappropriateness,Toxic Emotions,Excessive Intensity,Emotional Deception,Missing Commitment,Missing Intelligibility,Unclear Meaning,Missing Relevance,Confusing Reasoning,Other Reasons,Detrimental Orthography,Reason Unclassified
count,2191.0,2191.0,2191.0,2191.0,2191.0,2191.0,2191.0,2191.0,2191.0,2191.0,2191.0,2191.0,2191.0,2191.0
mean,1095.0,0.857599,0.53948,0.271109,0.183478,0.194888,0.335463,0.353263,0.209493,0.231858,0.079416,0.049293,0.035144,0.014605
std,632.631541,0.943512,0.498553,0.444634,0.387146,0.396205,0.47226,0.478093,0.40704,0.422115,0.270448,0.216528,0.184185,0.119994
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,547.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1095.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1642.5,2.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2190.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [6]:
# Keep only rows whose 'Inappropriateness' is 0 or 1, derive a readable 'Label'
# column from it (0 -> 'appropriate', anything else -> 'inappropriate'), then drop
# the two columns that are no longer needed.
# The steps are chained with `.assign` so the new column is created on a fresh
# frame rather than written into a filtered slice, which is what triggers pandas'
# SettingWithCopyWarning.
# Note: this cell rebinds `df` and removes 'Inappropriateness', so it can only be
# run once per load of the corpus.
df = (
    df.loc[df['Inappropriateness'].isin([0, 1])]
    .assign(
        Label=lambda kept: kept['Inappropriateness'].map(
            lambda value: 'appropriate' if value == 0 else 'inappropriate'
        )
    )
    .drop(columns=['source_dataset', 'Inappropriateness'])
)

In [7]:
# Non-null count per column after filtering and relabelling.
df.count()

post_id                    2191
issue                      2191
post_text                  2191
Toxic Emotions             2191
Excessive Intensity        2191
Emotional Deception        2191
Missing Commitment         2191
Missing Seriousness        2191
Missing Openness           2191
Missing Intelligibility    2191
Unclear Meaning            2191
Missing Relevance          2191
Confusing Reasoning        2191
Other Reasons              2191
Detrimental Orthography    2191
Reason Unclassified        2191
Label                      2191
dtype: int64

In [8]:
# From here on the text column is called 'argument' instead of 'post_text';
# afterwards, preview four randomly drawn rows.
df = df.rename({'post_text': 'argument'}, axis='columns')
df.sample(n=4)

Unnamed: 0,post_id,issue,argument,Toxic Emotions,Excessive Intensity,Emotional Deception,Missing Commitment,Missing Seriousness,Missing Openness,Missing Intelligibility,Unclear Meaning,Missing Relevance,Confusing Reasoning,Other Reasons,Detrimental Orthography,Reason Unclassified,Label
58,58,India has the potential to lead the world:,"I am an American, and I am ashamed to admit ""Merica is rich due to slavery. India never had slavery and used to be rich until the British came and took everything away. We were left in a bad state, but started developing quickly. India is much more competitive in education than America and soon when everyone has education (which will happen in our lifetime) India will excel.\r\nIndia has the potential.",1,1,0,0,False,False,1,0,1,0,0,0,0,inappropriate
1738,1738,India has the potential to lead the world:,"i think, the india is not ready to lead the world because of many reasons..\r\n1.) the politians of our country are involved in black money cases. they try to gain or earn money as possible.\r\n2.) they always try to make a big issue of useless topics. e.g., if a politian says something about another politian, they try to make it a big issue .",1,1,0,1,False,True,0,0,0,0,0,0,0,inappropriate
108,108,Why ladies bars have been shut down in india?:,"You r wrong my dear they havent been shut down in India, the ban is only in Maharastra. The Gov had shut down because the gov had received many a compliants from the housewifes about their male familly members wasting money on ladies bars and not looking after the families. Also many criminal activities took place at this bars. U must be knowing the tarannum case, or telgi case where tarannum monthly income was arouned 1 lac and telgi would throw away nereby 1 lac per day on the bar girls. To put a stop on shuch type of criminal activities the Gov of Maharastra had put a ban on these ladies bar",1,0,1,0,False,False,0,0,0,0,0,0,0,inappropriate
1196,1196,Do you think that people who are on drugs and commit a crime should be considered for pleas or fully prosecute:,"First drugs are illegal so a punishment goes there.\r\nUnder the use of drugs an act of crime occurs, may not have if accused was clean.\r\nConsider priors he/she may have.\r\nThen put together a jail term and a drug rehab program.\r\nI would suggest a 3 to 5 year sentence and less time for good behavior. \r\nYes you are right about a killing being less time. That really stinks.",0,0,0,0,False,False,0,0,0,0,0,0,0,appropriate


In [9]:
# For every row (regardless of Label), collect the names of the annotation columns
# whose value equals 1 into a list column 'Reasons'. Boolean flags count as well,
# because True == 1. Rows with no flagged column get an empty list.
# The candidate columns are determined once up front instead of being re-filtered
# for each row, and 'Reasons' itself is excluded so the cell can be re-run.
non_reason_cols = {'Label', 'post_id', 'issue', 'argument', 'Reasons'}
reason_cols = [col for col in df.columns if col not in non_reason_cols]
# Boolean matrix: one row per argument, one column per candidate reason.
reason_flags = df[reason_cols].eq(1).to_numpy()
df['Reasons'] = pd.Series(
    [
        [col for col, flagged in zip(reason_cols, row) if flagged]
        for row in reason_flags
    ],
    index=df.index,
    dtype=object,
)

In [10]:
# Spot-check four random rows, showing only the text, label and derived reasons.
df.loc[:, ['issue', 'argument', 'Label', 'Reasons']].sample(n=4)

Unnamed: 0,issue,argument,Label,Reasons
625,"Business review: 5.0 stars\r\nbusiness name: new way gourmet. city: pittsburgh. categories: mediterranean, sandwiches, delis, restaurants, pizza:","I can't sing new way's praises enough. I live a hop, skip and a jump away from this gem and checked it out for the first time after some Arsenal workers recommended it on a night they didn't have a food truck. The owner is super friendly, makes everything from scratch and gives you substantial portions at an extremely low price. The place is never too crowded so he is always able to whip up your delicious meal in no time whilst you window gaze at the busy strip of Penn Avenue outside. The tuscan chicken sandwich was awesome as well as the hot italian sausage sandwich when it is on special. You really can't go wrong with anything you get there, though. Definitely worth checking out if you are in the neighborhood!",appropriate,[]
1839,Cmv: these sanctions will help russia in the long run.:,"A long while ago Putin ordered all his government officials to close offshore accounts. He ordered all government officials to buy Russian made cars. And wealthy Russians use other countries to skate around Russian taxes. Now the US and EU are sanctioning business and people (I try to keep up on all the sanctions but I lose track). Europe buys from Russia much needed gas, and in turn, Russia buys things from Europe. They have a nice little ecosystem. So I believe, not only will this hurt Europe more so in the long run, but help Russia. Because Putin may use this to reign in his corrupt officials and tax skating billionaires. Don't rely on overseas business and don't use tax havens, because they will get sanctioned or frozen. \r\n\r\nThis is just my opinion, or view. I'd like to hear others. Thanks.",appropriate,[]
2091,Human growth and development should parents use spanking as an option to discipline:,"I think that children should get spanked once in a while. I got spanked when I was little but I could definitely say that I did learn lessons. Don't spank your kids though with whatever you find, you don't want to kill them !!",appropriate,[]
1181,Why do guilty cops not go to jail?:,"It happens all over the world, even bad countries aren't held accountable.\r\n\r\nSuch as israel for instance in its occupation of Palestinian lands since 1967. Israel was caught breaking the law (occupying another people's land) why is it that it was protected by the US (through Veto Power) and not militarily kicked out like any other normal country would have been? (eg. Iraq out of Kuwait etc.)\r\n\r\nSo in conclusion, the proper answer to your good question would be that the world isn't fair and just because some people have GUNS and ""protection"", (like some states have BIG GUNS and ""protection""), the law is lenient with them.\r\n\r\nLuv ya! :)\r\n\r\nBye Bye",inappropriate,"[Missing Commitment, Missing Openness, Missing Intelligibility, Unclear Meaning, Missing Relevance]"


In [11]:
# Class balance: number of arguments per 'Label' value (most frequent first).
df['Label'].value_counts()

Label
inappropriate    1182
appropriate      1009
Name: count, dtype: int64

In [12]:
# Keep the inappropriate arguments that have at least one annotated reason.
# Both conditions are combined into one boolean mask. Indexing twice in a row
# (df[mask_a][mask_b]) applies a full-length mask to an already-filtered frame,
# which makes pandas warn "Boolean Series key will be reindexed to match
# DataFrame index".
inappropriate = df.loc[
    (df['Label'] == 'inappropriate') & (df['Reasons'].map(len) > 0)
]

  inappropriate = df[df['Label'] == 'inappropriate'][df['Reasons'].apply(lambda x: len(x) > 0)]


In [13]:
# Non-null count per column of the filtered frame. In the recorded run this equals
# the number of 'inappropriate' rows, i.e. every such row has at least one reason.
inappropriate.count()

post_id                    1182
issue                      1182
argument                   1182
Toxic Emotions             1182
Excessive Intensity        1182
Emotional Deception        1182
Missing Commitment         1182
Missing Seriousness        1182
Missing Openness           1182
Missing Intelligibility    1182
Unclear Meaning            1182
Missing Relevance          1182
Confusing Reasoning        1182
Other Reasons              1182
Detrimental Orthography    1182
Reason Unclassified        1182
Label                      1182
Reasons                    1182
dtype: int64

In [14]:
# Persist the filtered subset without the index. CSV has no list type, so each
# 'Reasons' list is written as its text representation.
inappropriate.to_csv('../data/appropriateness-corpus/inappropriate_with_reasons_conservative.csv', index=False)

In [25]:
# Reload the saved subset.
# NOTE(review): after the CSV round trip 'Reasons' is a string such as
# "['Toxic Emotions', ...]" rather than a list; parse it (e.g. with
# ast.literal_eval) before using it as a list.
i_df = pd.read_csv('../data/appropriateness-corpus/inappropriate_with_reasons_conservative.csv')

In [26]:
# Spot-check four random rows of the reloaded subset.
i_df.loc[:, ['issue', 'argument', 'Label', 'Reasons']].sample(n=4)

Unnamed: 0,issue,argument,Label,Reasons
1070,"Cmv:""if you have the strength to kill yourself, you have the strength to continue living."":","I believe that if a person can suicide, they can continue living. It's stupid to do such a thing when you are willful enough to consider or even act on upon it. It really doesnt make any sense why you rather take your own life then live when you are so capable of the former. Suicide is not an option i find necessary and that a person who commit are stupid. So, topic is ""if you have the strength to kill yourself, you have the strength to live on"". Change my view over this topic as I am open to all responses from you all.",inappropriate,"['Toxic Emotions', 'Excessive Intensity', 'Emotional Deception', 'Missing Commitment', 'Missing Openness']"
935,If your spouse committed murder and he or she confided in you would you turn them in:,"If I do that, then who's going to cook and clean house? Who's gonna have sex with me? I'm too old to do the dating scene all over again. Besides, you don't specify who she killed or why. Maybe she had her reasons. Not everyone deserves to live, you know? ;)",inappropriate,"['Toxic Emotions', 'Emotional Deception', 'Missing Commitment', 'Missing Seriousness']"
28,Is the school uniform a good or bad idea:,"It is a good thing because this shows that the school and everyone else accepts each other no matter where they come from, who they are or what religion they are.",inappropriate,"['Missing Intelligibility', 'Unclear Meaning', 'Confusing Reasoning']"
85,Which is a better pizza or taco bell stance: pizza:,"I don't know where you live, but here in SoCal we have pizza delivery services availabe until 3am, plus i'd rather go to Del Taco anytime over Taco Smell or Taco Hell aka Taco Bell.\r\nBack in the 60s before Taco Fell became part of another multi-national corporation they were good, in our high school days we go there to cure the munchies, tacos were 5 for a buck back then.\r\nJJ",inappropriate,"['Toxic Emotions', 'Excessive Intensity', 'Missing Commitment', 'Missing Seriousness']"


In [38]:
# Load the full-agreement subset.
# NOTE(review): this file is not written by any cell visible in this notebook --
# presumably it is produced elsewhere; confirm it exists before running.
full_agreement_df = pd.read_csv('../data/appropriateness-corpus/inappropriate_with_reasons_full_agreement.csv')

In [27]:
# Word-count helper; the cells below use it to keep arguments of at least 100 words.
def get_word_count(text):
    r"""Return the number of word tokens in ``text``.

    A token is a maximal run of ``\w`` characters (letters, digits, underscore),
    so punctuation splits tokens: "don't" counts as two.

    Args:
        text: The string to measure. Non-string values -- e.g. ``None`` or the
            ``NaN`` pandas uses for missing CSV cells -- are treated as empty
            instead of raising ``TypeError``.

    Returns:
        int: Number of tokens; 0 for empty or non-string input.
    """
    if not isinstance(text, str):
        return 0
    return len(re.findall(r'\w+', text))

In [28]:
# Add a per-argument word count; progress_apply behaves like apply but shows a tqdm bar.
i_df['word_count'] = i_df['argument'].progress_apply(get_word_count)

100%|██████████| 1182/1182 [00:00<00:00, 17136.77it/s]


In [29]:
# Spot-check four random rows together with their computed word counts.
i_df.loc[:, ['issue', 'argument', 'Label', 'Reasons', 'word_count']].sample(n=4)

Unnamed: 0,issue,argument,Label,Reasons,word_count
69,Have u.s. landowners contributed to native american land lost from illegal government seizure of native land?:,"First off the ancient rite of war that every nation has lived by including the Lakota, which removed the Nation that was here before them, the victor of war gets the land. While there was some chicanery and our forefathers only really won a hand full of battle the Indian Wars were more a war of attrition and the Nations lost the land due to that fact.\r\n\r\nYes, we have contributed to the land, there are more trees than were on the Continent at the time of Columbus's landing. We grow most of the food uses to survive.",inappropriate,"['Toxic Emotions', 'Emotional Deception', 'Missing Commitment', 'Missing Seriousness', 'Missing Intelligibility', 'Missing Relevance']",99
893,Do you think people who make bad decisions and get into trouble can turn their life around or are they labeled:,"I think by your name you may be a Marine. Crimes are crimes, and some, can be forgiven with hard work and a real change. I am an NCO and I have seen troops grow up and take their duty and the rules seriously. You can change, even if you get discharged. You can ALWAYS choose the right path, just hope it is not to late, like for Tookie Williams. Chose to change too late.",inappropriate,"['Missing Commitment', 'Missing Openness', 'Missing Intelligibility', 'Missing Relevance']",75
812,Help me please?:,"ethically, if your fathers insurance goes up you should pay it. it will probably get reviewed with in six months. however the good news is this is just lifes lessons learned. you got a ticket for improper passing, no big deal. she got a ticket because she was half to blame at least. lets assume there is a big doctor bill. you are probably insured on your fathers policy had you been at fault. your friend has the problem and he is totally at fault. he let you use the car. now if there is a big clain he is not insured because he didn't list you as a separate driver and he might have to pay out of his pocket. this is his problem not yours. i don't see this as an ethical problem for you because you are insured.",inappropriate,"['Missing Intelligibility', 'Missing Relevance']",143
1150,Is incest big issue in us?:,"Well, first off, incest is not always ""child abuse."" Sex between closely related CONSENTING adults probably happens. \r\n\r\nIn the USA, incest is not common. There is a stereotype about the south, that it happens there a lot. This is untrue. In fact, the highest rates of incest in the USA are found in among hispanics. \r\n\r\nFurthermore, incest is RAMPANT in Central and South America. \r\n\r\nLove Jack",inappropriate,"['Toxic Emotions', 'Excessive Intensity', 'Missing Commitment', 'Missing Openness']",66


In [39]:
# Same word count for the full-agreement subset, so both subsets can be compared.
full_agreement_df['word_count'] = full_agreement_df['argument'].progress_apply(get_word_count)

100%|██████████| 214/214 [00:00<00:00, 44472.13it/s]


In [40]:
# Full-agreement arguments with 100 or more words (non-null count per column).
full_agreement_df.loc[full_agreement_df['word_count'].ge(100)].count()

post_id                    31
issue                      31
argument                   31
Toxic Emotions             31
Excessive Intensity        31
Emotional Deception        31
Missing Commitment         31
Missing Seriousness        31
Missing Openness           31
Missing Intelligibility    31
Unclear Meaning            31
Missing Relevance          31
Confusing Reasoning        31
Other Reasons              31
Detrimental Orthography    31
Reason Unclassified        31
Label                      31
Reasons                    31
word_count                 31
dtype: int64

In [30]:
# Conservative-subset arguments with 100 or more words (non-null count per column).
i_df.loc[i_df['word_count'].ge(100)].count()

post_id                    376
issue                      376
argument                   376
Toxic Emotions             376
Excessive Intensity        376
Emotional Deception        376
Missing Commitment         376
Missing Seriousness        376
Missing Openness           376
Missing Intelligibility    376
Unclear Meaning            376
Missing Relevance          376
Confusing Reasoning        376
Other Reasons              376
Detrimental Orthography    376
Reason Unclassified        376
Label                      376
Reasons                    376
word_count                 376
dtype: int64

In [33]:
# Subset of the conservative arguments that are at least 100 words long.
i_df_long = i_df.loc[i_df['word_count'].ge(100)]

In [35]:
# Persist the long-argument subset (including the 'word_count' column) without the index.
i_df_long.to_csv('../data/appropriateness-corpus/inappropriate_with_reasons_conservative_long.csv', index=False)

In [36]:
# Spot-check four random rows of the long-argument subset.
i_df_long.loc[:, ['issue', 'argument', 'Label', 'Reasons', 'word_count']].sample(n=4)

Unnamed: 0,issue,argument,Label,Reasons,word_count
1067,How legally binding is a triple dog dare?:,"there is NOTHING on the planet more binding than that of the Triple Dog Dare. Example:\r\n\r\nThong-Wearing Men Arrested at Wal-Mart \r\nThu Jul 29,10:59 AM ET\r\nAP\r\n\r\nSCOTTSBLUFF, Neb. - Two men who were arrested for walking through a Wal-Mart while wearing women's thong underwear blamed the stunt on a ""triple-dog dare,"" authorities said. \r\n\r\nThe men, ages 35 and 36, bought two pair of underwear at the store Tuesday, went into a bathroom and came out wearing only the thongs and T-shirts, police said. \r\n\r\nWitnesses said the men walked through the store and out to their car. \r\n\r\nPolice caught the men in the parking lot, and reviewed a surveillance tape before arresting them for public indecency and disorderly conduct. \r\n\r\nWhen asked why they were wearing thong underwear, one of the men said a friend ""triple-dog dared"" them. They will not be prosecuted, authorities said.",inappropriate,"['Missing Commitment', 'Missing Seriousness', 'Missing Openness', 'Missing Intelligibility', 'Unclear Meaning', 'Other Reasons', 'Reason Unclassified']",152
1020,Is it better to have a lousy father or to be fatherless:,I think in any case its better to have no father then to have a lousy father either way you still lose. When you have a father you want him to be the best he can be so you can learn from him as you grow up. If you were to have a lousy father then whats the point of having a father at all if he's just going to be lousy and he wouldn't teach you anything. It's always better to have a respectable father figure in your life then to have a lousy father figure.,inappropriate,"['Missing Intelligibility', 'Missing Relevance', 'Other Reasons', 'Reason Unclassified']",100
505,Cmv: i think dishwashers are a waste of time and money.:,"My reasoning is simple:\r\n\r\n- dishwashers require the dishes to be ""pre cleaned"". This already defeats the purpose, why did I pay hundreds of dollars for a machine that doesn't have food processing capabilities?\r\n- dishwashers rarely removed ingrained or dried gunk. This only worsens when you charge the dishwasher over a couple of days, and the older plates have already dried up. You have to take the still dirty plates and clean them manually.\r\n- dishwashers don't properly dry ""deep"" items like Tupperware and pots, if you aren't careful taking these out, you'll get everything else wet, once again making the process useless. \r\n\r\nI think overall that dishwashers are overrated, undercapable, overpriced appliances that have no place in a modern kitchen.",inappropriate,"['Missing Commitment', 'Missing Openness']",123
1068,Is torture ever justified? why or why not?:,"Before you write your paper, make sure that you have the facts.\r\n\r\nThe press sells papers by exaggerating the issues. Guantanamo was an excellent example. While the inmates were humiliated, they weren't tortured. The accusations still spring up, however, and the UN is using the accusation to bash the US.\r\n\r\nAdditionally, the UN has determined that force feeding prisoners that go on a hunger strike is torture. So, if you feed them, it's torture. If you don't and they starve, it's torture.\r\n\r\nFinally, prisoners will lie about treatment because it's a form of political power they can use while incarcerated.",inappropriate,"['Toxic Emotions', 'Emotional Deception', 'Missing Commitment', 'Missing Openness']",105
