### Importing all libs

In [None]:
import pandas as pd
import numpy as np
import re

In [None]:
# Load the raw dataset from a path relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="./dataset.csv")

## Data Cleaning

### Identifying unique channel names before standardization

In [None]:
# List the distinct raw values in the Channel column.
print(df.loc[:, "Channel"].unique())

### Correct Channel names

In [None]:
# Each canonical channel name is listed with the raw spellings that should be
# rewritten to it; the comprehension flattens that table into the
# {raw: canonical} mapping handed to Series.replace.
df['Channel'] = df['Channel'].replace({
    raw_name: canonical
    for canonical, raw_names in {
        "Geo News": ("geo", "Geo", "GEO NEWS"),
        "ARY News": ("ARY", "ary", "ARYNEWS"),
        "Samaa TV": ("Samaa", "SAMAA TV"),
        "Express News": ("Express", "Express-News"),
        "Hum News": ("hum news", "HUM News"),
        "Dawn News": ("DawnNews", "DAWN"),
    }.items()
    for raw_name in raw_names
})

# Show the distinct values again to confirm the result of the replacement.
print(df.loc[:, 'Channel'].unique())



### Find All Journalist Names

In [None]:
# List the distinct raw values in the Journalist column.
print(df.loc[:, "Journalist"].unique())

### Make journalist names consistent and fix spelling mistakes

In [None]:
# Title-case every name first, then rewrite the listed variants to a single
# canonical spelling.
df["Journalist"] = (
    df["Journalist"]
    .str.title()
    .replace({
        "Rauf Klassra": "Rauf Klasra",
        "Mohsin Raza Khan": "Mohsin Raza",
        "K. Khan": "Kamran Khan",
    })
)
print(df.loc[:, "Journalist"].unique())

### Identify all city and region names to standardize

In [None]:
# List the distinct City values, then the distinct Region values.
print(df["City"].unique(), df["Region"].unique(), sep="\n")

### Map Cities to Correct Regions

In [None]:

# Region -> list of cities that belong to it.
region_city_map = {
    'Punjab': ['Lahore', 'Multan', 'Rawalpindi'],
    'Sindh': ['Karachi', 'Hyderabad'],
    'KPK': ['Peshawar'],
    'Balochistan': ['Quetta'],
    'Islamabad': ['Islamabad'],
    'AJK': ['Muzaffarabad']
}

def correct_region(row):
    """Return a copy of ``row`` with City and Region made consistent.

    Rules, applied in order:
      1. If Region is 'AJK', City is set to 'Muzaffarabad'.
      2. Otherwise, if City is listed in ``region_city_map``, Region is set to
         the first region whose city list contains it.
      3. Otherwise the row is returned unchanged.

    Parameters
    ----------
    row : pandas.Series
        A record with at least 'City' and 'Region' entries.

    Returns
    -------
    pandas.Series
        A corrected copy; the input object is never modified.
    """
    # Work on a copy: DataFrame.apply does not support functions that mutate
    # the object they are handed.
    fixed = row.copy()

    # rule 1: if region is AJK always set city to Muzaffarabad
    if fixed['Region'] == 'AJK':
        fixed['City'] = 'Muzaffarabad'
        return fixed

    # rule 2: for non-AJK regions fix region according to city
    matched_region = next(
        (region for region, cities in region_city_map.items() if fixed['City'] in cities),
        None,
    )
    if matched_region is not None:
        fixed['Region'] = matched_region
    return fixed

# Run the row-level correction across the whole frame.
df = df.apply(correct_region, axis="columns")

# Show each distinct (City, Region) pair, ordered by city, followed by the
# distinct values left in each column.
display(
    df[['City', 'Region']]
    .drop_duplicates()
    .sort_values(by='City')
    .reset_index(drop=True)
)
print(df["Region"].unique(), df["City"].unique(), sep="\n")

### Identify all topics

In [None]:
# List the distinct raw values in the Topic column.
print(df.loc[:, "Topic"].unique())

### Fix topic based on headlines
   This is done manually with keyword matching. A permanent solution would use an NLP model to predict the topic from the headline. The keyword approach misses edge cases: for example, if a headline contains keywords from two or more categories, it is assigned the topic that comes first.
   

In [None]:

# Topic -> lowercase keywords whose presence in a headline selects that topic.
# Matching is by plain substring and topics are tried in the order written
# here, so the first topic with any hit wins.
# NOTE(review): "court" is listed under both Crime and Judiciary; because Crime
# is tried first, a headline matched only by "court" is labelled Crime.
# NOTE(review): short keywords such as "win", "tv", "bar" or "rain" also match
# inside longer words because of the substring test.
topic_keywords = {
    "Health": [
        "hospital", "doctor", "virus", "covid", "vaccine", "cases", "patients",
        "medicine", "shortage", "disease", "pandemic", "healthcare", "polio", "dengue"
    ],
    "Economy": [
        "budget", "imf", "inflation", "tax", "deficit", "exports", "imports",
        "trade", "revenue", "market", "stock", "growth", "investment", "finance"
    ],
    "Terrorism": [
        "attack", "blast", "bomb", "terrorist", "militant", "explosion", "security",
        "waziristan", "operation", "killed", "army", "soldier", "convoy", "suicide"
    ],
    "Sports": [
        "match", "cricket", "football", "hockey", "win", "tournament", "player",
        "psl", "worldcup", "babar", "qalandars", "karachi kings", "series"
    ],
    "Politics": [
        "election", "government", "assembly", "minister", "prime", "party",
        "pmln", "ppp", "pti", "parliament", "cabinet", "bill", "opposition", "rally"
    ],
    "Crime": [
        "murder", "arrested", "robbery", "police", "shooting", "kidnapping",
        "theft", "suspect", "gang", "investigation", "crime", "violence", "court"
    ],
    "Judiciary": [
        "court", "judge", "justice", "supreme", "high", "petition", "case",
        "hearing", "bench", "verdict", "order", "lawyer", "bar", "judicial"
    ],
    "Environment": [
        "flood", "rain", "climate", "weather", "storm", "pollution", "temperature",
        "heatwave", "earthquake", "relief", "disaster", "ndma", "environment"
    ],
    "Media": [
        "journalist", "anchor", "channel", "pemra", "censorship", "press",
        "freedom", "reporter", "tv", "news", "media", "talkshow", "backlash"
    ],
    "Education": [
        "school", "teacher", "student", "university", "exam", "degree",
        "curriculum", "education", "hec", "scholarship", "college", "protest"
    ]
}

def correct_topic(row):
    """Pick a topic for ``row`` from keywords found in its headline.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'Headline' and 'Topic'.

    Returns
    -------
    The first topic in ``topic_keywords`` with a keyword that occurs (as a
    substring, case-insensitively) in the headline. If no keyword matches, or
    the headline is not a string (e.g. NaN/None for a missing value), the
    row's existing 'Topic' is returned unchanged.
    """
    headline = row['Headline']
    current_topic = row['Topic']

    # A missing headline arrives as NaN/None, which has no .lower(); there is
    # nothing to match against, so keep whatever topic the row already has.
    if not isinstance(headline, str):
        return current_topic

    headline = headline.lower()
    for topic, keywords in topic_keywords.items():
        if any(kw in headline for kw in keywords):
            return topic
    return current_topic

# Re-label every row's topic from its headline, then show the frame.
df['Topic'] = df.apply(correct_topic, axis="columns")
display(df)


### Identifying journalist domains, adding flags, and counting matches and mismatches

In [None]:
# Journalist name -> topics regarded as that journalist's domain.
journalist_domain = dict([
    ("Najam Sethi", ["Politics", "Media", "Sports"]),
    ("Dr. Shahid Masood", ["Politics", "Judiciary"]),
    ("Kamran Shahid", ["Politics", "Media"]),
    ("Matiullah Jan", ["Judiciary", "Politics"]),
    ("Rauf Klasra", ["Politics", "Economy"]),
    ("Ansar Abbasi", ["Judiciary", "Politics"]),
    ("Umar Cheema", ["Crime", "Politics"]),
    ("Shahzeb Khanzada", ["Politics", "Economy", "Media"]),
    ("Kamran Khan", ["Politics", "Economy", "Sports", "Media"]),
    ("Asma Shirazi", ["Politics", "Media"]),
    ("Mohsin Raza", ["Media", "Politics"]),
    ("Saleem Safi", ["Politics", "Terrorism"]),
    ("Talat Hussain", ["Politics", "Media"]),
    ("Owais Tohid", ["Media", "Politics"]),
    ("Nusrat Javed", ["Politics", "Media"]),
])

# Attach each row's expected topic list by looking up its journalist.
df["ExpectedDomain"] = df["Journalist"].map(journalist_domain)
def check_mismatch(row):
    """Flag whether a row's topic falls inside its journalist's expected domain.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'ExpectedDomain' (a collection of topic names, or a
        missing value) and 'Topic'.

    Returns
    -------
    int
        1 when 'Topic' is one of the expected domains, otherwise 0. A row whose
        'ExpectedDomain' is not a collection (e.g. NaN/None) is reported as 0
        rather than raising.
    """
    expected = row["ExpectedDomain"]

    # A missing domain shows up as a float NaN (or None), which does not
    # support `in`; treat it as "no match".
    if not isinstance(expected, (list, tuple, set, frozenset)):
        return 0

    return 1 if row["Topic"] in expected else 0
    
# Compute the per-row flag, preview the columns involved, and tally the flags.
df["DomainFlag"] = df.apply(check_mismatch, axis="columns")
display(df.loc[:, ["Journalist", "Topic", "ExpectedDomain", "DomainFlag"]].head(10))
print(df["DomainFlag"].value_counts())  # how many rows carry each flag value


### Checking for inconsistencies in newspaper names, if any exist

In [None]:
# List the distinct raw values in the Newspaper column.
print(df.loc[:, "Newspaper"].unique())

### Identifying and fixing Revenue & AdSpend units

In [None]:
# First well-formed decimal number in a string: "12", "12.5" or ".5".
# A lone "." (as in "rs.") is deliberately not a match.
_PKR_NUMBER = re.compile(r'\d+(?:\.\d+)?|\.\d+')

# (unit words, multiplier) pairs, tried in this order.
_PKR_UNIT_MULTIPLIERS = (
    (('million',), 1_000_000),
    (('lakh', 'lac'), 100_000),
    (('crore', 'cr'), 10_000_000),
)

def convert_to_pkr(x):
    """Convert a raw amount to a plain number.

    Parameters
    ----------
    x : int, float, str or missing value
        Either an already-numeric amount or text such as "5 million",
        "2.5 lakh", "3 crore" or "1,200".

    Returns
    -------
    number
        * NaN when ``x`` is missing.
        * ``x`` unchanged when it is already an ``int`` or ``float``.
        * The first number in the text times the unit multiplier when the text
          contains "million", "lakh"/"lac" or "crore"/"cr".
        * ``float(text)`` for unit-less text, after removing commas.
        * NaN when the text cannot be parsed (including a unit word with no
          number next to it).
    """
    if pd.isna(x):
        return np.nan
    if isinstance(x, (int, float)):
        return x

    s = str(x).lower().replace(',', '').strip()

    for unit_words, multiplier in _PKR_UNIT_MULTIPLIERS:
        if any(word in s for word in unit_words):
            match = _PKR_NUMBER.search(s)
            if match is None:
                # Unit word present but no digits to scale.
                return np.nan
            return float(match.group()) * multiplier

    # fallback numeric
    try:
        return float(s)
    except ValueError:
        return np.nan
    
# Normalise both money columns with the same converter. The converted values
# are stored as returned; no flooring to whole numbers is applied.
df['Revenue'], df['AdSpend'] = (
    df['Revenue'].apply(convert_to_pkr),
    df['AdSpend'].apply(convert_to_pkr),
)

### Fixing negative airtime value
 It is better to replace them with NaN instead of making them positive, so the data does not become distorted.

In [None]:
# Keep values that are >= 0; anything else (negative, None, NaN) becomes NaN.
df['Airtime'] = df['Airtime'].apply(
    lambda value: np.nan if value is None or not value >= 0 else value
)


### Fixing Out of limit TRP

In [None]:
# Missing or negative ratings become NaN; everything else is capped at 100.
df['Ratings'] = df['Ratings'].apply(
    lambda rating: min(rating, 100) if not (pd.isna(rating) or rating < 0) else np.nan
)

### Fixing controversy flag inconsistencies

In [None]:
# Rewrite 'Yes'/'No' to the strings '1'/'0' and turn empty strings into NaN.
# The two lists are matched position by position.
df['ControversyFlag'] = df['ControversyFlag'].replace(
    to_replace=['Yes', 'No', ''],
    value=['1', '0', np.nan],
)
print(df.loc[:, "ControversyFlag"].unique())

### Fixing inconsistencies in the missing-data flag column

In [None]:
# Recompute the flag from the other columns only. The flag column itself is
# left out of the check so its previous (possibly missing) values cannot
# influence the result; this also makes the cell give the same answer when it
# is run again. errors='ignore' covers the case where the column is absent.
# NOTE(review): as written, '1' marks a row with NO missing values and '0' a
# row with at least one missing value — confirm this polarity is intended.
# NOTE(review): every other column currently in df takes part in the check,
# including any derived columns added earlier — confirm that is intended.
df['MissingDataFlag'] = (
    df.drop(columns=['MissingDataFlag'], errors='ignore')
    .notna()
    .all(axis=1)
    .astype(int)
    .astype(str)
)


### Fixing bias scoring using tanh squashing

In [None]:
# Pass the whole column through tanh in one vectorised call.
# Running this cell twice applies tanh twice.
df["BiasScore"] = df["BiasScore"].pipe(np.tanh)

### Fixing incorrect Language entries

In [None]:
# Rewrite the listed raw spellings to their canonical language names; the two
# lists are matched position by position.
df["Language"] = df["Language"].replace(
    to_replace=["urdu", "ENG"],
    value=["Urdu", "English"],
)