### Importing all libs

In [71]:
import pandas as pd
import numpy as np
import re

In [72]:
# Load the raw dataset from the CSV file that sits next to the notebook.
df = pd.read_csv("./dataset.csv")

## Data Cleaning

### Identifying unique channel names before standardization

In [73]:
# Show the distinct raw Channel values to spot spelling and casing variants.
print(df["Channel"].unique())

['ARY News' 'HUM News' 'Samaa' 'DawnNews' 'SAMAA TV' 'geo' 'AbbTakk News'
 'hum news' 'Geo' 'Express-News' 'GEO NEWS' 'ARYNEWS' 'ary' 'DAWN'
 'Express' 'ARY']


### Correct Channel names

In [74]:
# Canonical channel name -> every spelling/casing variant seen in the raw data.
# Values that are not listed here are left untouched by `replace`.
channel_variants = {
    "Geo News": ("geo", "Geo", "GEO NEWS"),
    "ARY News": ("ARY", "ary", "ARYNEWS"),
    "Samaa TV": ("Samaa", "SAMAA TV"),
    "Express News": ("Express", "Express-News"),
    "Hum News": ("hum news", "HUM News"),
    "Dawn News": ("DawnNews", "DAWN"),
}

# Flatten to the variant -> canonical mapping that `Series.replace` expects.
channel_fixes = {
    variant: canonical
    for canonical, variants in channel_variants.items()
    for variant in variants
}
df['Channel'] = df['Channel'].replace(channel_fixes)

print(df['Channel'].unique())



['ARY News' 'Hum News' 'Samaa TV' 'Dawn News' 'Geo News' 'AbbTakk News'
 'Express News']


### Find All Journalist Names

In [75]:
# Show the distinct raw Journalist values to spot casing and spelling variants.
print(df["Journalist"].unique())

['Najam Sethi' 'Dr. Shahid Masood' 'Kamran Shahid' 'Matiullah Jan'
 'RAUF KLASSRA' 'Ansar Abbasi' 'Umar Cheema' 'Shahzeb Khanzada'
 'shahzeb khanzada' 'Kamran Khan' 'K. Khan' 'Asma Shirazi'
 'Mohsin Raza Khan' 'Saleem Safi' 'Talat Hussain' 'Rauf Klasra'
 'Owais Tohid' 'Nusrat Javed' 'Mohsin Raza']


### Make journalist names consistent and fix spelling mistakes

In [76]:
# Exact-match corrections applied after title-casing: one misspelling, one
# name with an extra surname, and one abbreviation.
journalist_fixes = {
    "Rauf Klassra": "Rauf Klasra",
    "Mohsin Raza Khan": "Mohsin Raza",
    "K. Khan": "Kamran Khan",
}

# Title-case first so that upper/lower-case variants collapse into one form,
# then apply the exact-match corrections above.
title_cased_names = df["Journalist"].str.title()
df["Journalist"] = title_cased_names.replace(journalist_fixes)
print(df["Journalist"].unique())

['Najam Sethi' 'Dr. Shahid Masood' 'Kamran Shahid' 'Matiullah Jan'
 'Rauf Klasra' 'Ansar Abbasi' 'Umar Cheema' 'Shahzeb Khanzada'
 'Kamran Khan' 'Asma Shirazi' 'Mohsin Raza' 'Saleem Safi' 'Talat Hussain'
 'Owais Tohid' 'Nusrat Javed']


### Identify all cities and Regions names to standardize

In [77]:
# Show the distinct City and Region values before reconciling them.
print(df["City"].unique())
print(df["Region"].unique())

['Multan' 'Quetta' 'Rawalpindi' 'Karachi' 'Peshawar' 'Islamabad' 'Lahore'
 'Hyderabad']
['Islamabad' 'Sindh' 'Balochistan' 'KPK' 'Punjab' 'AJK']


### Map Cities to Correct Regions

In [78]:

# Region -> cities that belong to it.
region_city_map = {
    'Punjab': ['Lahore', 'Multan', 'Rawalpindi'],
    'Sindh': ['Karachi', 'Hyderabad'],
    'KPK': ['Peshawar'],
    'Balochistan': ['Quetta'],
    'Islamabad': ['Islamabad'],
    'AJK': ['Muzaffarabad']
}

# Reverse lookup (city -> region) built once from the table above.
# `setdefault` keeps the first region listed for a city, matching the order
# in which the table would be scanned.
city_region_lookup = {}
for _region, _cities in region_city_map.items():
    for _city in _cities:
        city_region_lookup.setdefault(_city, _region)


def correct_region(row):
    """Return a copy of `row` with a consistent City/Region pair.

    Rules, applied in order:
      1. If Region is 'AJK', City is forced to 'Muzaffarabad'.
      2. Otherwise, if City is listed in `region_city_map`, Region is set to
         the region that owns that city.
      3. Otherwise the row is returned unchanged.

    The input row is never modified: pandas documents that functions passed
    to `DataFrame.apply` must not mutate the object they receive, so all
    edits are made on a copy.

    Parameters
    ----------
    row : mapping-like with 'City' and 'Region' keys (e.g. a DataFrame row).

    Returns
    -------
    A copy of `row` with the corrected 'City' / 'Region' values.
    """
    fixed = row.copy()

    # rule 1: if region is AJK always set city to Muzaffarabad
    if fixed['Region'] == 'AJK':
        fixed['City'] = 'Muzaffarabad'
        return fixed

    # rule 2: for non-AJK regions fix region according to city
    region = city_region_lookup.get(fixed['City'])
    if region is not None:
        fixed['Region'] = region
    return fixed

# Reconcile City/Region row by row.
df = df.apply(correct_region, axis=1)

# Show each distinct (City, Region) pair once, ordered by city, as a sanity
# check that every city now maps to a single region.
city_region_pairs = (
    df[['City', 'Region']]
    .drop_duplicates()
    .sort_values(by='City')
    .reset_index(drop=True)
)
display(city_region_pairs)
print(df["Region"].unique())
print(df["City"].unique())

Unnamed: 0,City,Region
0,Hyderabad,Sindh
1,Islamabad,Islamabad
2,Karachi,Sindh
3,Lahore,Punjab
4,Multan,Punjab
5,Muzaffarabad,AJK
6,Peshawar,KPK
7,Quetta,Balochistan
8,Rawalpindi,Punjab


['Punjab' 'Balochistan' 'Sindh' 'KPK' 'Islamabad' 'AJK']
['Multan' 'Quetta' 'Rawalpindi' 'Karachi' 'Peshawar' 'Islamabad' 'Lahore'
 'Muzaffarabad' 'Hyderabad']


### Identify all topics

In [79]:
# Show the distinct Topic values present before any correction.
print(df["Topic"].unique())

['Sports' 'Crime' 'Health' 'Terrorism' 'Media' 'Education' 'Judiciary'
 'Politics' 'Economy']


### Fix topics based on headlines
   This is done manually with keyword matching. A permanent solution would use an NLP model to predict the topic from the headline. The keyword approach misses edge cases: if a headline contains keywords from two or more categories, the topic is assigned based on whichever category is checked first.
   

In [80]:

# Topic -> lower-case keywords. Topics are tried in the order written here and
# the first topic with any keyword found in the headline wins.
# Matching is plain substring matching, so short keywords (e.g. "win", "tv",
# "bar") also match inside longer words.
# NOTE(review): "court" is listed under both Crime and Judiciary; because Crime
# is checked first, "court" alone can never select Judiciary - confirm intent.
topic_keywords = {
    "Health": [
        "hospital", "doctor", "virus", "covid", "vaccine", "cases", "patients",
        "medicine", "shortage", "disease", "pandemic", "healthcare", "polio", "dengue"
    ],
    "Economy": [
        "budget", "imf", "inflation", "tax", "deficit", "exports", "imports",
        "trade", "revenue", "market", "stock", "growth", "investment", "finance"
    ],
    "Terrorism": [
        "attack", "blast", "bomb", "terrorist", "militant", "explosion", "security",
        "waziristan", "operation", "killed", "army", "soldier", "convoy", "suicide"
    ],
    "Sports": [
        "match", "cricket", "football", "hockey", "win", "tournament", "player",
        "psl", "worldcup", "babar", "qalandars", "karachi kings", "series"
    ],
    "Politics": [
        "election", "government", "assembly", "minister", "prime", "party",
        "pmln", "ppp", "pti", "parliament", "cabinet", "bill", "opposition", "rally"
    ],
    "Crime": [
        "murder", "arrested", "robbery", "police", "shooting", "kidnapping",
        "theft", "suspect", "gang", "investigation", "crime", "violence", "court"
    ],
    "Judiciary": [
        "court", "judge", "justice", "supreme", "high", "petition", "case",
        "hearing", "bench", "verdict", "order", "lawyer", "bar", "judicial"
    ],
    "Environment": [
        "flood", "rain", "climate", "weather", "storm", "pollution", "temperature",
        "heatwave", "earthquake", "relief", "disaster", "ndma", "environment"
    ],
    "Media": [
        "journalist", "anchor", "channel", "pemra", "censorship", "press",
        "freedom", "reporter", "tv", "news", "media", "talkshow", "backlash"
    ],
    "Education": [
        "school", "teacher", "student", "university", "exam", "degree",
        "curriculum", "education", "hec", "scholarship", "college", "protest"
    ]
}

def correct_topic(row):
    """Pick a topic for a row from the keywords found in its headline.

    The headline is lower-cased and each topic in `topic_keywords` is tried in
    definition order; the first topic with at least one keyword occurring as a
    substring of the headline is returned.

    Parameters
    ----------
    row : mapping-like with 'Headline' and 'Topic' keys (e.g. a DataFrame row).

    Returns
    -------
    The matched topic name, or the row's existing 'Topic' when no keyword
    matches or when the headline is not a string (for example a missing value).
    """
    current_topic = row['Topic']
    headline = row['Headline']
    # A missing headline is read as NaN (a float), which has no `.lower()`;
    # there is nothing to classify, so keep the topic already on the row.
    if not isinstance(headline, str):
        return current_topic
    headline = headline.lower()
    for topic, keywords in topic_keywords.items():
        if any(kw in headline for kw in keywords):
            return topic
    return current_topic

# Re-derive the topic of every row from its headline, then show the frame.
corrected_topics = df.apply(correct_topic, axis=1)
df['Topic'] = corrected_topics
display(df)


Unnamed: 0,ID,Journalist,Channel,Newspaper,Region,City,Topic,Headline,Ratings,Revenue,...,BiasScore,Viewership,Shares,AdSpend,ControversyFlag,MissingDataFlag,Date,Language,PoliticalAffiliation,SocialMediaInteractions
0,0,Najam Sethi,ARY News,The News,Punjab,Multan,Health,Polio cases reported in KPK,10.613151,11648570,...,,1.756573e+06,136184.0,4452987.876,No,1.0,2021-06-21,English,Opposition,
1,1,Dr. Shahid Masood,Hum News,Dawn,Balochistan,Quetta,Economy,Budget deficit reaches record high,27.545705,5 million,...,5.0,1.546170e+06,,1 crore,1,,2019-11-19,urdu,,14461.0
2,2,Kamran Shahid,Samaa TV,Jang,Punjab,Rawalpindi,Terrorism,Attack on Army convoy in Waziristan,50.387473,14072656,...,,,,50 lakh,Yes,,2024-05-07,urdu,Pro-Govt,33436.0
3,3,Matiullah Jan,Dawn News,Nawa-i-Waqt,Punjab,Multan,Health,Hospitals face medicine shortages,,812673,...,2.0,1.083972e+06,16712.0,1 crore,1,,2019-10-03,urdu,Neutral,
4,4,Rauf Klasra,Samaa TV,The News,Sindh,Karachi,Crime,Chief Justice orders suo moto on Karachi violence,,,...,,7.006350e+05,,3975468.534,,0.0,2018-02-07,Urdu,Neutral,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,14995,Saleem Safi,AbbTakk News,The News,Sindh,Karachi,Media,Ban on talkshows sparks outrage,93.527230,,...,,3.635476e+00,,50 lakh,Yes,0.0,2019-04-24,ENG,Neutral,1963.0
14996,14996,Saleem Safi,Hum News,The News,Balochistan,Quetta,Environment,Flood relief funds under scrutiny,,9191778,...,0.0,3.935510e+05,,50 lakh,,,2024-12-20,English,Neutral,
14997,14997,Shahzeb Khanzada,ARY News,Jang,KPK,Peshawar,Politics,Khan addresses rally in Multan,,5 million,...,4.0,1.472680e+06,,3355467.497,No,0.0,2018-11-01,urdu,Opposition,29609.0
14998,14998,Saleem Safi,Dawn News,Nawa-i-Waqt,Punjab,Multan,Politics,Aid for Balochistan still missing,23.414386,,...,3.0,,65194.0,1 crore,Yes,0.0,2023-08-01,urdu,,18882.0


### Identifying journalist domains, adding flags, and counting matches and mismatches

In [81]:
# Journalist name -> list of topics treated as that journalist's expected domain.
# Keys use the cleaned (title-cased, spelling-corrected) journalist names.
journalist_domain = {
    "Najam Sethi": ["Politics", "Media", "Sports"],
    "Dr. Shahid Masood": ["Politics", "Judiciary"],
    "Kamran Shahid": ["Politics", "Media"],
    "Matiullah Jan": ["Judiciary", "Politics"],
    "Rauf Klasra": ["Politics", "Economy"],
    "Ansar Abbasi": ["Judiciary", "Politics"],
    "Umar Cheema": ["Crime", "Politics"],
    "Shahzeb Khanzada": ["Politics", "Economy", "Media"],
    "Kamran Khan": ["Politics", "Economy", "Sports", "Media"],
    "Asma Shirazi": ["Politics", "Media"],
    "Mohsin Raza": ["Media", "Politics"],
    "Saleem Safi": ["Politics", "Terrorism"],
    "Talat Hussain": ["Politics", "Media"],
    "Owais Tohid": ["Media", "Politics"],
    "Nusrat Javed": ["Politics", "Media"]
}
# Attach the expected-topic list to each row; journalists that are not keys of
# the table above get NaN in ExpectedDomain.
df["ExpectedDomain"] = df["Journalist"].map(journalist_domain)
def check_mismatch(row):
    """Flag whether a row's topic lies within the journalist's expected domain.

    Note that, despite the function name, 1 means MATCH and 0 means mismatch.

    Parameters
    ----------
    row : mapping-like with 'ExpectedDomain' (a collection of topic names) and
        'Topic' keys (e.g. a DataFrame row).

    Returns
    -------
    int
        1 if 'Topic' is contained in 'ExpectedDomain', otherwise 0. Also 0
        when 'ExpectedDomain' is not a collection, e.g. NaN for a journalist
        with no domain entry.
    """
    expected = row["ExpectedDomain"]
    topic = row["Topic"]
    # A journalist without a domain entry yields NaN (a float) here, and
    # `topic in NaN` would raise TypeError; treat it as "no match".
    if not isinstance(expected, (list, tuple, set, frozenset)):
        return 0
    return 1 if topic in expected else 0
    
# DomainFlag is 1 when the row's topic is inside the journalist's expected
# domain and 0 otherwise (see check_mismatch).
df["DomainFlag"] = df.apply(check_mismatch, axis=1)
# Preview the first rows side by side to eyeball the flag.
display(df[["Journalist", "Topic", "ExpectedDomain", "DomainFlag"]].head(10))
print(df["DomainFlag"].value_counts()) # Print the counts of matches and mismatches


Unnamed: 0,Journalist,Topic,ExpectedDomain,DomainFlag
0,Najam Sethi,Health,"[Politics, Media, Sports]",0
1,Dr. Shahid Masood,Economy,"[Politics, Judiciary]",0
2,Kamran Shahid,Terrorism,"[Politics, Media]",0
3,Matiullah Jan,Health,"[Judiciary, Politics]",0
4,Rauf Klasra,Crime,"[Politics, Economy]",0
5,Ansar Abbasi,Sports,"[Judiciary, Politics]",0
6,Ansar Abbasi,Health,"[Judiciary, Politics]",0
7,Umar Cheema,Media,"[Crime, Politics]",0
8,Shahzeb Khanzada,Economy,"[Politics, Economy, Media]",1
9,Shahzeb Khanzada,Terrorism,"[Politics, Economy, Media]",0


DomainFlag
0    10911
1     4089
Name: count, dtype: int64


### Checking for inconsistencies in newspaper names, if any exist

In [82]:
# Show the distinct Newspaper values to check for spelling or casing variants.
print(df["Newspaper"].unique())

['The News' 'Dawn' 'Jang' 'Nawa-i-Waqt' 'Express Tribune' 'Daily Pakistan']


### Identifying and fixing Revenue & AdSpend units

In [83]:
# (substrings that identify a unit, multiplier) pairs, checked in this order.
_UNIT_MULTIPLIERS = (
    (('million',), 1_000_000),
    (('lakh', 'lac'), 100_000),
    (('crore', 'cr'), 10_000_000),
)


def convert_to_pkr(x):
    """Convert a monetary value that may carry a unit word into a plain number.

    Handling, in order:
      * missing values (per `pd.isna`) -> NaN
      * int / float -> returned unchanged
      * anything else is stringified, lower-cased, stripped and has commas
        removed; if it contains 'million', 'lakh'/'lac' or 'crore'/'cr', the
        first run of digits/dots is multiplied by 1e6, 1e5 or 1e7 respectively
      * otherwise the cleaned text is parsed directly as a float

    Parameters
    ----------
    x : number, string or missing value.

    Returns
    -------
    float (or the original int/float). NaN when the value is missing, when a
    unit word is present without a parsable number, or when the text is not
    numeric.
    """
    if pd.isna(x):
        return np.nan
    if isinstance(x, (int, float)):
        return x
    s = str(x).lower().replace(',', '').strip()
    for markers, multiplier in _UNIT_MULTIPLIERS:
        if any(marker in s for marker in markers):
            match = re.search(r'[\d\.]+', s)
            # A unit word with no digits (e.g. "million") cannot be
            # converted; report it as missing instead of crashing.
            if match is None:
                return np.nan
            try:
                return float(match.group()) * multiplier
            except ValueError:
                # The matched run is not a valid number (e.g. "1.2.3" or ".").
                return np.nan
    # fallback numeric
    try:
        return float(s)
    except ValueError:
        return np.nan
    
# Normalise both monetary columns to plain numbers.
for money_column in ('Revenue', 'AdSpend'):
    df[money_column] = df[money_column].apply(convert_to_pkr)
# Flooring the values to int is left disabled:
# df['Revenue'] = df['Revenue'].apply(lambda x: int(np.floor(x)) if pd.notna(x) else np.nan)
# df['AdSpend'] = df['AdSpend'].apply(lambda x: int(np.floor(x)) if pd.notna(x) else np.nan)

### Fixing negative airtime values
 It is better to replace them with NaN instead of making them positive, so the data does not become distorted.

In [84]:
def _non_negative_or_nan(value):
    """Return `value` when it is zero or positive, otherwise NaN."""
    if value is None:
        return np.nan
    if value >= 0:
        return value
    # Negative values (and NaN, which fails the comparison) become NaN.
    return np.nan


df['Airtime'] = df['Airtime'].apply(_non_negative_or_nan)


### Fixing Out of limit TRP

In [85]:
def _bounded_rating(rating):
    """Return NaN for missing or negative ratings; cap everything else at 100."""
    if pd.isna(rating) or rating < 0:
        return np.nan
    return min(rating, 100)


df['Ratings'] = df['Ratings'].apply(_bounded_rating)

### Fixing controversy flag inconsistencies

In [86]:
# Collapse the textual spellings onto the '1' / '0' codes and turn empty
# strings into missing values; anything else is left as it is.
controversy_codes = {'Yes': '1', 'No': '0', '': np.nan}
df['ControversyFlag'] = df['ControversyFlag'].replace(controversy_codes)
print(df["ControversyFlag"].unique())
# Persist the cleaned frame (without the index column).
df.to_csv("dataset_cleaned.csv", index=False)

['0' '1' nan]


### Fixing inconsistencies in the missing-data flag column

In [88]:
# Recompute MissingDataFlag from the data itself: '1' when the row has at
# least one missing value in any OTHER column, '0' when the row is complete.
# The flag column is excluded from the check so that its own stale/missing
# values cannot influence the result, and so that re-running this cell gives
# the same answer.
other_columns = df.drop(columns=['MissingDataFlag'], errors='ignore')
df['MissingDataFlag'] = other_columns.isna().any(axis=1).astype(int).astype(str)
print(df['MissingDataFlag'].unique())
# Persist the cleaned frame (without the index column).
df.to_csv("dataset_cleaned.csv", index=False)

['0' '1']
