# Data Preprocessing and Analysis

## Install and Imports

In [None]:
!pip install -q datasets transformers[torch]
!pip install accelerate -U -q
!pip install -q huggingface_hub
!pip install -q wandb

In [None]:
from urllib import request
import os
import csv

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

# Data

## Download Data and Utils

In [None]:
def fetch_url(module_url):
  """Download a UTF-8 text resource into the current working directory.

  The local file name is the last '/'-separated segment of the URL.

  Args:
    module_url: URL of the resource to download.
  """
  module_name = module_url.split('/')[-1]
  print(f'Fetching {module_url}')
  # Write with an explicit encoding: a bare open(..., 'w') uses the platform's
  # default encoding, which can fail on (or re-encode) non-ASCII text.
  with request.urlopen(module_url) as f, open(module_name, 'w', encoding='utf-8') as outf:
    outf.write(f.read().decode('utf-8'))

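Download the training split's paragraph IDs and labels (`train_semeval_parids-labels.csv`), the full dataset (`dontpatronizeme_pcl.tsv`), and the dev split's paragraph IDs and labels (`dev_semeval_parids-labels.csv`)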

In [None]:
# Paragraph ids and labels for the training split (train_semeval_parids-labels.csv).
train_parids_labels_url = (
    "https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/"
    "semeval-2022/practice%20splits/train_semeval_parids-labels.csv"
)
fetch_url(train_parids_labels_url)

In [None]:
train_url = "https://raw.githubusercontent.com/CRLala/NLPLabs-2024/main/Dont_Patronize_Me_Trainingset/dontpatronizeme_pcl.tsv"
fetch_url(train_url)


In [None]:
# NOTE(review): this is the same URL that the next cell fetches as `dev_parids_labels_url`,
# so the dev par_id/label file is downloaded twice; one of the two cells looks redundant.
dev_url = "https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/practice%20splits/dev_semeval_parids-labels.csv"
fetch_url(dev_url)

In [None]:
dev_parids_labels_url = "https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/practice%20splits/dev_semeval_parids-labels.csv"
fetch_url(dev_parids_labels_url)

Download `evaluation.py`

In [None]:
# Scoring script published alongside the dataset.
eval_py_url = (
    "https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/"
    "semeval-2022/evaluation.py"
)
fetch_url(eval_py_url)

## Transform and Load Data

In [None]:
def load_test(test_path):
  """Load a tab-separated test file into a DataFrame.

  Args:
    test_path: Path to a file with one record per line and five tab-separated
      fields: par_id, art_id, keyword, country, text.

  Returns:
    A DataFrame with columns par_id, art_id, keyword, country, text; every
    value is kept as a string.
  """
  columns = ['par_id', 'art_id', 'keyword', 'country', 'text']
  with open(test_path) as f:
    rows = [line.strip().split('\t') for line in f]
  # Hand the frame back to the caller; without this the function yields None.
  return pd.DataFrame(rows, columns=columns)

In [None]:
def load_train(train_path):
  """Load the tab-separated training file into a DataFrame.

  The first four lines of the file are skipped (presumably a preamble; confirm
  against the dataset file). Each remaining non-blank line is split on tabs:
  fields 0-4 are par_id, art_id, keyword, country and text, and the last field
  is the original label.

  Args:
    train_path: Path to the training TSV file.

  Returns:
    A DataFrame with columns par_id, art_id, keyword, country, text, label,
    orig_label. `label` is 0 when the original label is '0' or '1' and 1
    otherwise; `orig_label` is the untouched label string.
  """
  columns = ['par_id', 'art_id', 'keyword', 'country', 'text', 'label', 'orig_label']
  rows = []
  with open(train_path) as f:
    for line_no, line in enumerate(f):
      if line_no < 4:
        continue
      stripped = line.strip()
      if not stripped:
        # Blank lines (e.g. a trailing newline at end of file) carry no record.
        continue
      # Split once and reuse the fields instead of re-splitting per column.
      fields = stripped.split('\t')
      orig_label = fields[-1]
      rows.append({
        'par_id': fields[0],
        'art_id': fields[1],
        'keyword': fields[2],
        'country': fields[3],
        'text': fields[4],
        'label': 0 if orig_label in ('0', '1') else 1,
        'orig_label': orig_label,
      })
  return pd.DataFrame(rows, columns=columns)

In [None]:
# helper function to save predictions to an output file
def labels2file(p, outf_path):
  """Write predictions to a text file, one comma-separated row per line.

  Args:
    p: Iterable of rows; each row is an iterable whose items are rendered
      with str().
    outf_path: Path of the output file (opened in write mode).
  """
  with open(outf_path, 'w') as sink:
    for row in p:
      cells = map(str, row)
      sink.write(','.join(cells) + '\n')

In [None]:

train_path = "./dontpatronizeme_pcl.tsv"
df = load_train(train_path)

In [None]:
df.head()

In [None]:
df["country"].value_counts()

#### Data insights

In [None]:
def is_less_than_20_words(text):
    """Return True when `text` has at most 20 whitespace-separated words.

    Note: despite the name, the bound is inclusive, so exactly 20 words passes.
    """
    word_count = len(text.split())
    return word_count <= 20

In [None]:
# Rows whose original label is the string "4", then only the short paragraphs among them.
sampled_df = df.loc[df['orig_label'].eq("4")]
filtered_df = sampled_df.loc[sampled_df['text'].map(is_less_than_20_words)]

In [None]:
# Print every selected paragraph with its id and country for manual reading.
for record in filtered_df.itertuples(index=False):
    print(f"Paragraph ID: {record.par_id}")
    print("Country: ", record.country)
    print("Text:")
    print(record.text)
    print("-" * 80)  # separator between paragraphs

In [None]:

# Horizontal bar chart of how many paragraphs carry each original label.
orig_label_counts = df.groupby('orig_label').size()
ax = orig_label_counts.plot(kind='barh', color=sns.palettes.mpl_palette('Dark2'))
ax.spines[['top', 'right']].set_visible(False)

In [None]:
df["label"].value_counts()

#### Creating Splits

In [None]:
# Alias of the full labelled dataset; the split-building cells below look rows up in `data` by par_id.
# NOTE(review): `data` is rebound to a DatasetDict in the "Push Data to HF Hub" section and `df` is
# rebound to the training split in "Data Analysis", so this cell and the split cells only work when
# run in order from a fresh kernel.
data = df

In [None]:
trids = pd.read_csv('train_semeval_parids-labels.csv')
trids.par_id = trids.par_id.astype(str)
trids.head()

In [None]:
# Build one record per training par_id from the full dataset.
# Index the dataset by par_id once (keeping the first occurrence of each id) instead of
# re-scanning the whole frame with a boolean mask several times per id.
by_parid = data.drop_duplicates(subset='par_id').set_index('par_id')

rows = [] # will contain par_id, label and text
for parid in trids.par_id:
  match = by_parid.loc[parid]  # raises KeyError when the id is absent from the dataset
  rows.append({
      'par_id':parid,
      'community':match['keyword'],
      'text':match['text'],
      'label':match['label'],
  })

In [None]:
trdf1 = pd.DataFrame(rows)
# DataFrame.info() prints its summary itself and returns None, so wrapping it in print()
# only appends a stray "None" line to the output.
trdf1.info()
trdf1.head()

In [None]:
trdf1["label"].value_counts()

Do the same to create the validation set

In [None]:
valids = pd.read_csv('dev_semeval_parids-labels.csv')
valids.par_id = valids.par_id.astype(str)
valids.head()


In [None]:
# Build one record per validation par_id from the full dataset.
# Index the dataset by par_id once (keeping the first occurrence of each id) instead of
# re-scanning the whole frame with a boolean mask several times per id.
by_parid = data.drop_duplicates(subset='par_id').set_index('par_id')

rows = [] # will contain par_id, label and text
for parid in valids.par_id:
  match = by_parid.loc[parid]  # raises KeyError when the id is absent from the dataset
  rows.append({
      'par_id':parid,
      'community':match['keyword'],
      'text':match['text'],
      'label':match['label'],
  })

In [None]:
valdf1 = pd.DataFrame(rows)
valdf1.head()

### Push Data to HF Hub

In [None]:
from datasets import Dataset, DatasetDict

In [None]:
data = DatasetDict({
    "train": Dataset.from_pandas(trdf1),
    "valid": Dataset.from_pandas(valdf1),
})

In [None]:
# Read the access token from the HF_TOKEN environment variable rather than embedding it in
# the notebook. If the variable is unset, token=None is passed and the library is expected
# to fall back to locally saved credentials.
data.push_to_hub("ImperialIndians23/nlp_cw_data_unprocessed", token=os.environ.get("HF_TOKEN"))


## Data Analysis

In [None]:
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(style="whitegrid")

import nltk
from wordcloud import WordCloud
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')

In [None]:
# Analyse a copy: the cells below add helper columns (text_length, processed_text) to `df`,
# and a plain alias would add them to `trdf1` as well, i.e. to the training split that is
# preprocessed and pushed to the Hub later (while the validation split lacks them).
df = trdf1.copy()

In [None]:
df.head()

In [None]:
# Length of each paragraph in characters, plus two summary figures.
df['text_length'] = df['text'].map(len)
print("Max Text Length:", df['text_length'].max())
print("Mean Text Length:", df['text_length'].mean())

In [None]:
# Bar chart of how many paragraphs fall into each binary label.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='label', data=df, ax=ax)
ax.set_title('Distribution of Class Labels')
ax.set_xlabel('Label')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Histogram (with density estimate) of paragraph lengths over the whole training split.
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(df['text_length'], bins=50, kde=True, ax=ax)
ax.set_title('Distribution of Text Lengths')
ax.set_xlabel('Text Length')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Box plot comparing paragraph length between the two labels.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='label', y='text_length', data=df, ax=ax)
ax.set_title('Text Length vs. Label')
ax.set_xlabel('Label')
ax.set_ylabel('Text Length')
plt.show()

- Fewer outliers for 1 than for 0.
- Median text length is similar.

In [None]:
# Stacked length histogram split by label; the x-axis is clipped to 0-6000 characters.
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(
    data=df,
    x='text_length',
    hue='label',
    multiple="stack",
    kde=True,
    palette="Set2",
    ax=ax,
)
ax.set_title('Text Length Distribution by Label')
ax.set_xlabel('Text Length')
ax.set_ylabel('Frequency')
ax.set_xlim(0, 6000)
ax.get_legend().set_title('Label')

plt.show()


- Similar right-skew of data.
- Majority of texts are concentrated in the lower range of lengths (0-1000).

In [None]:
# Count paragraphs per (community, label) pair and lay the labels out as columns.
community_label_counts = df.groupby(['community', 'label']).size().reset_index(name='counts')

pivot_table = community_label_counts.pivot(index='community', columns='label', values='counts').fillna(0)

# Row-normalise so that each community's label shares sum to 1.
pivot_table_normalized = pivot_table.div(pivot_table.sum(axis=1), axis=0)

# Draw on an explicit Axes: DataFrame.plot opens its own figure when no `ax` is given, so a
# preceding plt.figure(...) call would be rendered as an extra, empty figure.
fig, ax = plt.subplots(figsize=(14, 8))
pivot_table_normalized.plot(kind='bar', stacked=True, colormap='viridis', ax=ax)
ax.set_title('Proportion of Labels within Each Community')
ax.set_xlabel('Community')
ax.set_ylabel('Proportion of Labels')
ax.legend(title='Label', loc='upper right')
plt.show()

In [None]:

stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Tokenize lower-cased `text` and keep alphabetic tokens that are not stop words.

    Relies on the notebook-level `stop_words` set and on `word_tokenize`.

    Returns:
        List of the surviving tokens, in their original order.
    """
    kept = []
    for token in word_tokenize(text.lower()):
        if not token.isalpha():
            continue
        if token in stop_words:
            continue
        kept.append(token)
    return kept

# Space-joined cleaned tokens for every paragraph.
df['processed_text'] = df['text'].map(preprocess).str.join(' ')

# One long string per class, used as the input text of each word cloud.
texts_label_0 = df.loc[df['label'] == 0, 'processed_text'].str.cat(sep=' ')
texts_label_1 = df.loc[df['label'] == 1, 'processed_text'].str.cat(sep=' ')

# Generate word clouds (same canvas settings for both classes)
cloud_options = {'width': 800, 'height': 400, 'background_color': 'white'}
wordcloud_0 = WordCloud(**cloud_options).generate(texts_label_0)
wordcloud_1 = WordCloud(**cloud_options).generate(texts_label_1)


In [None]:
# Show the two word clouds side by side, one panel per class.
fig, (ax_label_0, ax_label_1) = plt.subplots(1, 2, figsize=(45, 20))

panels = (
    (ax_label_0, wordcloud_0, 'Word Cloud for Non Patronizing Text'),
    (ax_label_1, wordcloud_1, 'Word Cloud for Patronizing Text'),
)
for panel_ax, cloud, panel_title in panels:
    panel_ax.imshow(cloud, interpolation='bilinear')
    panel_ax.axis('off')
    panel_ax.set_title(panel_title)

plt.show()

**Conclusion**:
1. Text length exhibits a wide range but is quite similar across both classes, with a similar median.
2. There are outliers, particularly for label 0, which could influence the model's performance.
3. The dataset is class-imbalanced: label 1 has significantly fewer examples than label 0.
4. Based on the `Proportion of Labels within Each Community` plot, we can see that `homeless`, `in-need` and `poor-families` are patronized more often than the other communities.


In [None]:
df.to_csv("training_8375.csv", index=False)

## Preprocessing Data

In [None]:
import re
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# Built once when the cell runs, so the stop-word list is not reloaded for every paragraph.
_ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))

def preprocess_text(text):
    """Normalise a paragraph for modelling.

    Steps, in order: lower-case, drop every character that is neither a word
    character nor whitespace, tokenize, remove English stop words, lemmatize
    each remaining token, and join the result with single spaces.

    Args:
        text: Raw paragraph text.

    Returns:
        The preprocessed text as a single space-separated string.
    """
    # Lowercasing
    text = text.lower()
    # Removing punctuation
    # NOTE(review): this runs before the stop-word filter, so contractions lose their
    # apostrophe ("don't" -> "dont") first; confirm the stop-word list still matches them.
    text = re.sub(r'[^\w\s]', '', text)
    # Tokenization
    tokens = word_tokenize(text)
    # Removing stopwords
    filtered_tokens = [w for w in tokens if w not in _ENGLISH_STOP_WORDS]
    # Lemmatization
    lemmatized_tokens = [lemmatizer.lemmatize(w) for w in filtered_tokens]
    # Re-joining tokens
    return ' '.join(lemmatized_tokens)



In [None]:
# Overwrites the raw training text in place with its preprocessed form.
# NOTE(review): not idempotent; re-running this cell preprocesses already-preprocessed text.
trdf1['text'] = trdf1['text'].apply(preprocess_text)

In [None]:
# Overwrites the raw validation text in place with its preprocessed form.
# NOTE(review): not idempotent; re-running this cell preprocesses already-preprocessed text.
valdf1['text'] = valdf1['text'].apply(preprocess_text)

In [None]:
valdf1.head()

### Push Data to HF Hub

In [None]:
from datasets import Dataset, DatasetDict

In [None]:
data = DatasetDict({
    "train": Dataset.from_pandas(trdf1),
    "valid": Dataset.from_pandas(valdf1),
})

In [None]:
data

In [None]:
!huggingface-cli login --token=put your token here

In [None]:
# Read the access token from the HF_TOKEN environment variable rather than embedding it in
# the notebook. If the variable is unset, token=None is passed and the library is expected
# to fall back to locally saved credentials.
data.push_to_hub("ImperialIndians23/nlp_cw_data_processed", token=os.environ.get("HF_TOKEN"))