# EDA

## Import Libraries & Load Data

In [3]:
import pandas as pd
import numpy as np

# Load CSV
df = pd.read_csv('../pipeline/faculty_all_cleaned.csv')

# Basic info
print("Shape of dataset:", df.shape)
df.head()

Shape of dataset: (112, 15)


Unnamed: 0,name,profile_url,email,phone,address,faculty_web,education,biography,specialization,teaching,publications,research,faculty_type,source_file,text_for_embedding
0,Umang shah,https://www.daiict.ac.in/adjunct-faculty/umang...,umang_shah@dau.ac.in,,,,"[""PDP Programme, Aalto University, Finland""]",[],"[""Integrated Interdisciplinary Design"", ""Desig...",[],[],[],adjunct,adjunct.json,Integrated Interdisciplinary Design Design and...
1,Troy vasanth,https://www.daiict.ac.in/adjunct-faculty/troy-...,troy_vasanth@dau.ac.in,,,,"[""PGDPD (Animation Film Design), NID, Ahmedabad""]",[],"[""Sound Design, 3D, Animation and Motion Design""]",[],[],[],adjunct,adjunct.json,"Sound Design, 3D, Animation and Motion Design"
2,Swati priya,https://www.daiict.ac.in/adjunct-faculty/swati...,swati_priya@dau.ac.in,,,,"[""PhD (Heavy Metal Detection in Crops and Soil...",[],"[""Remote sensing and GIS, Precision agricultur...",[],[],[],adjunct,adjunct.json,"Remote sensing and GIS, Precision agriculture,..."
3,Subhas chandra nandy,https://www.daiict.ac.in/adjunct-faculty/subha...,subhas_nandy@dau.ac.in,079-68261717,"# 4105, FB-4, DAU, Gandhinagar, Gujarat, India...",,"[""PhD (Computer Science), University of Calcut...","[""Subhas C Nandy received the M.Sc. degree in ...","[""Algorithms, Data Structure, Graph Applicatio...","[""Algorithms, Data Structure, Computational Ge...",[],[],adjunct,adjunct.json,Subhas C Nandy received the M.Sc. degree in St...
4,Samit bhattacharya,https://www.daiict.ac.in/adjunct-faculty/samit...,samit_bhattacharya@dau.ac.in,,,,"[""PhD (Computer Science & Engineering), IIT Kh...",[],"[""Extended reality (virtual, augmented & mixed...",[],[],[],adjunct,adjunct.json,"Extended reality (virtual, augmented & mixed r..."


## Statistics Overview

In [4]:
total_faculty = df.shape[0]
total_columns = df.shape[1]

print(f"Total Faculty Records: {total_faculty}")
print(f"Total Attributes: {total_columns}")

Total Faculty Records: 112
Total Attributes: 15


Missing Values

In [5]:
missing_stats = pd.DataFrame({
    "Column": df.columns,
    "Missing Count": df.isnull().sum(),
    "Available (%)": (1 - df.isnull().mean()) * 100
}).sort_values(by="Available (%)", ascending=False)

missing_stats

Unnamed: 0,Column,Missing Count,Available (%)
name,name,0,100.0
profile_url,profile_url,0,100.0
education,education,0,100.0
publications,publications,0,100.0
teaching,teaching,0,100.0
specialization,specialization,0,100.0
biography,biography,0,100.0
research,research,0,100.0
faculty_type,faculty_type,0,100.0
source_file,source_file,0,100.0


In [6]:
contact_cols = ["email", "phone", "faculty_website"]

for col in contact_cols:
    if col in df.columns:
        percent = (df[col].notnull().mean()) * 100
        print(f"{col} available for {percent:.2f}% of faculty")

email available for 99.11% of faculty
phone available for 71.43% of faculty


In [7]:
import pandas as pd
import ast

def parse_specialization(val):
    # If already a list, return it
    if isinstance(val, list):
        return val

    # If NaN or None
    if val is None or (isinstance(val, float) and pd.isna(val)):
        return []

    # Convert stringified lists
    val = str(val).strip()
    if val in ["", "[]"]:
        return []

    try:
        parsed = ast.literal_eval(val)
        return parsed if isinstance(parsed, list) else [parsed]
    except:
        return []

df["specialization"] = df["specialization"].apply(parse_specialization)

df["specialization"] = df["specialization"].apply(
    lambda lst: [
        s.strip().lower()
        for item in lst
        for s in str(item).split(",")
    ]
)

all_specs = df["specialization"].explode()

all_specs = all_specs[
    (all_specs.notna()) &
    (all_specs != "") &
    (~all_specs.str.contains("area|interest|research", na=False))
]

unique_specs = all_specs.nunique()
print("Unique Specializations:", unique_specs)

top_specializations = all_specs.value_counts().head(10)
top_specializations

Unique Specializations: 391


specialization
machine learning               7
computer vision                7
information retrieval          6
natural language processing    6
image processing               5
signal processing              3
photography                    3
algorithms                     3
vlsi                           2
theoretical physics            2
Name: count, dtype: int64

In [8]:
def parse_publications(pub):
    if isinstance(pub, list):
        return pub
    try:
        return ast.literal_eval(pub)
    except:
        return []

df['publications_list'] = df['publications'].apply(parse_publications)

# Now count publications properly
df['num_publications'] = df['publications_list'].apply(len)

# Max publications
max_pub_row = df.loc[df['num_publications'].idxmax()]
print(f"Faculty with max publications: {max_pub_row['name']} ({max_pub_row['num_publications']} publications)")

# Min publications
min_pub_row = df.loc[df['num_publications'].idxmin()]
print(f"Faculty with min publications: {min_pub_row['name']} ({min_pub_row['num_publications']} publications)")

# Average publications
avg_pub = df['num_publications'].mean()
print(f"Average number of publications: {avg_pub:.2f}")

Faculty with max publications: Tapas kumar maiti (61 publications)
Faculty with min publications: Umang shah (0 publications)
Average number of publications: 8.46


In [9]:
df["publication_category"] = pd.cut(
    df["num_publications"],
    bins=[-1, 0, 5, 15, np.inf],
    labels=["No publications", "1–5", "6–15", "15+"]
)

# Count how many faculty in each category
df["publication_category"].value_counts()

publication_category
No publications    42
6–15               30
1–5                22
15+                18
Name: count, dtype: int64

In [10]:
df["text_length"] = df["biography"].astype(str).apply(len)

print("Average text length:", df["text_length"].mean())
print("Max text length:", df["text_length"].max())

Average text length: 541.7053571428571
Max text length: 2443


Faculty Count per Source File

In [11]:
if "faculty_type" in df.columns:
    print("Number of unique faculty types:", df["faculty_type"].nunique())
    df["faculty_type"].value_counts()

Number of unique faculty types: 5


In [12]:
source_stats = df["source_file"].value_counts().to_frame("count")
source_stats["percentage"] = (df["source_file"].value_counts(normalize=True) * 100).round(2)
source_stats

Unnamed: 0_level_0,count,percentage
source_file,Unnamed: 1_level_1,Unnamed: 2_level_1
faculty.json,69,61.61
adjunct.json,26,23.21
international_adjunct.json,11,9.82
practice.json,4,3.57
distinguished.json,2,1.79


PhD vs Non-PhD Faculty

In [13]:
# Create boolean column for PhD
df["has_phd"] = df["education"].str.contains("phd", case=False, na=False)

# Counts and percentages in one DataFrame
phd_stats = df["has_phd"].value_counts().to_frame("count")
phd_stats["percentage"] = (df["has_phd"].value_counts(normalize=True) * 100).round(2)
phd_stats.index = ["Without PhD", "With PhD"]  # optional, for readability

phd_stats

Unnamed: 0,count,percentage
Without PhD,95,84.82
With PhD,17,15.18
