In [26]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report


In [8]:
df = pd.read_csv('/Users/tuckermcneal/Desktop/Personal Info/JobApplicants.csv')


Unnamed: 0,Company,Title,Location,Applied Date,Company Sector,Sentiment,Success
0,Daikin Applied,Data Analyst Intern,"Hybrid, based in Plymouth, MN",1/17/26,Manufacturing,2,0
1,Uline,Software Developer Internship - Summer 2026,"Onsite, based in Pleasant Prairie, WI",1/17/26,Wholesale Trade,1,0
2,Husco,Automation & AI Intern,"Onsite, based in Waukesha, WI",1/17/26,Manufacturing,1,0
3,Johnson Controls,Software Engineering Intern,"Hybrid, based in Milwaukee, WI, or Glendale, WI",1/15/26,Engineering & Construction,2,1
4,iStream Financial Services,Software Engineering Intern,"Hybrid, based in Brookfield, WI",1/15/26,Internet & Software,2,1


In [10]:
#Change the date to a readable form
df["Applied Date"] = pd.to_datetime(df["Applied Date"])

#Change success metric into a purely numeric form
df["Success"] = pd.to_numeric(df["Success"], errors="coerce")

print(df.head())
print(df.info())

                      Company                                        Title  \
0              Daikin Applied                          Data Analyst Intern   
1                       Uline  Software Developer Internship - Summer 2026   
2                       Husco                       Automation & AI Intern   
3            Johnson Controls                  Software Engineering Intern   
4  iStream Financial Services                  Software Engineering Intern   

                                          Location Applied Date  \
0                    Hybrid, based in Plymouth, MN   2026-01-17   
1            Onsite, based in Pleasant Prairie, WI   2026-01-17   
2                    Onsite, based in Waukesha, WI   2026-01-17   
3  Hybrid, based in Milwaukee, WI, or Glendale, WI   2026-01-15   
4                  Hybrid, based in Brookfield, WI   2026-01-15   

               Company Sector  Sentiment  Success  
0               Manufacturing          2        0  
1             Wholesale 

In [16]:

#Add and additional column "Work Type" 
df["Work Type"] = "Other"
df.loc[df["Location"].str.contains("Hybrid", case=False, na=False), "Work Type"] = "Hybrid"
df.loc[df["Location"].str.contains("Onsite", case=False, na=False), "Work Type"] = "Onsite"
df.loc[df["Location"].str.contains("Remote", case=False, na=False), "Work Type"] = "Remote"


Unnamed: 0,Company,Title,Location,Applied Date,Company Sector,Sentiment,Success,Work Type,State
0,Daikin Applied,Data Analyst Intern,"Hybrid, based in Plymouth, MN",2026-01-17,Manufacturing,2,0,Hybrid,
1,Uline,Software Developer Internship - Summer 2026,"Onsite, based in Pleasant Prairie, WI",2026-01-17,Wholesale Trade,1,0,Onsite,
2,Husco,Automation & AI Intern,"Onsite, based in Waukesha, WI",2026-01-17,Manufacturing,1,0,Onsite,
3,Johnson Controls,Software Engineering Intern,"Hybrid, based in Milwaukee, WI, or Glendale, WI",2026-01-15,Engineering & Construction,2,1,Hybrid,
4,iStream Financial Services,Software Engineering Intern,"Hybrid, based in Brookfield, WI",2026-01-15,Internet & Software,2,1,Hybrid,


In [18]:
# Sector Success Rate
overall_success_rate = df["Success"].mean()
print("Overall success rate:", round(overall_success_rate * 100, 1), "%")

sector_stats = df.groupby("Company Sector")["Success"].agg(["count", "mean"])
sector_stats.rename(columns={"count": "Applications", "mean": "Success Rate"}, inplace=True)
print(sector_stats)


Overall success rate: 11.1 %
                                     Applications  Success Rate
Company Sector                                                 
Accounting                                      1      0.000000
CPG                                             1      0.000000
Construction                                    2      0.500000
Engineering & Construction                      1      1.000000
Financial Services                              2      0.000000
Food & Beverage                                 3      0.000000
Government - Local, State & Federal             2      0.000000
Healthcare                                      1      1.000000
Higher Education                                1      0.000000
Insurance                                       2      0.000000
Internet & Software                             8      0.125000
Investment / Portfolio Management               5      0.000000
Management Consulting                           1      1.000000
Manufacturi

In [22]:
# Success rate by sentiment score
sentiment_stats = df.groupby("Sentiment")["Success"].mean()
print(sentiment_stats)

# Success rate by work type
worktype_stats = df.groupby("Work Type")["Success"].agg(["count", "mean"])
worktype_stats.rename(columns={"count": "Applications", "mean": "Success Rate"}, inplace=True)
print(worktype_stats)

Sentiment
1    0.050000
2    0.090909
3    0.300000
Name: Success, dtype: float64
           Applications  Success Rate
Work Type                            
Hybrid               19      0.210526
Onsite               26      0.115385
Other                 9      0.000000
Remote                9      0.000000


In [24]:
#Sort List based on date
df_dates = df.set_index("Applied Date").sort_index()

#Print the applications by each week
weekly_apps = df_dates["Success"].resample("W").count()
print(weekly_apps)

# Weekly success rate
weekly_success_rate = df_dates["Success"].resample("W").mean()
print(weekly_success_rate)


Applied Date
2025-03-23     1
2025-03-30     1
2025-04-06     0
2025-04-13     1
2025-04-20     0
2025-04-27     0
2025-05-04     0
2025-05-11     0
2025-05-18     0
2025-05-25     0
2025-06-01     0
2025-06-08     0
2025-06-15     0
2025-06-22     0
2025-06-29     0
2025-07-06     0
2025-07-13     0
2025-07-20     0
2025-07-27     0
2025-08-03     0
2025-08-10     0
2025-08-17     0
2025-08-24     0
2025-08-31     0
2025-09-07     3
2025-09-14     7
2025-09-21     5
2025-09-28     0
2025-10-05     3
2025-10-12    13
2025-10-19     2
2025-10-26     5
2025-11-02     0
2025-11-09     0
2025-11-16     2
2025-11-23     1
2025-11-30     5
2025-12-07     1
2025-12-14     2
2025-12-21     2
2025-12-28     2
2026-01-04     0
2026-01-11     1
2026-01-18     6
Freq: W-SUN, Name: Success, dtype: int64
Applied Date
2025-03-23    0.000000
2025-03-30    0.000000
2025-04-06         NaN
2025-04-13    0.000000
2025-04-20         NaN
2025-04-27         NaN
2025-05-04         NaN
2025-05-11         NaN
2

In [29]:


feature_cols = ["Company Sector", "Work Type", "Sentiment"]
X = pd.get_dummies(df[feature_cols], drop_first=True)
y = df["Success"]

# Drop rows with missing target
mask = y.notna()
X, y = X[mask], y[mask]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred, zero_division=0))


              precision    recall  f1-score   support

           0       0.89      1.00      0.94        17
           1       0.00      0.00      0.00         2

    accuracy                           0.89        19
   macro avg       0.45      0.50      0.47        19
weighted avg       0.80      0.89      0.85        19

