In [25]:
import pandas as pd
import numpy as np
import math

In [26]:
data = pd.read_csv("cfbweek3.csv")

In [27]:
priors = pd.read_csv("MasseyRatings_2024.csv")  # Team, MasseyRating
prior_dict = dict(zip(priors['Team'], priors['MasseyRating']))

In [28]:
filtered_data = data[
    (data["Completed"] == True)  # Ensure games are completed
    & (data["Week"] <= 10)
    & (data["SeasonType"] != 'postseason')
]

# Extract necessary columns
games = filtered_data[['HomeTeam', 'AwayTeam', 'HomePoints', 'AwayPoints']]

# Calculate score differential
games['Score Differential'] = abs(games['HomePoints'] - games['AwayPoints'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  games['Score Differential'] = abs(games['HomePoints'] - games['AwayPoints'])


In [29]:
import numpy as np
import pandas as pd

# Unique list of teams
teams = set(games['HomeTeam']).union(set(games['AwayTeam']))
teams_list = sorted(list(teams))  # sorted for consistency
team_to_idx = {team: i for i, team in enumerate(teams_list)}

In [30]:
# Build a list of equations for each game
# We'll end up with #games equations (+ priors + 1 sum-constraint), and we have #teams unknowns.

num_games = len(games)
n = len(teams_list)
winners_bonus = 2.75

# Encode function so changing it once applies to BOTH real games and priors
def encode_mov(mov: float, method = 'sqrt_with_bonus',cap = 35, winners_bonus = 2.75) -> float:
    if method == 'sqrt_with_bonus':
        target = np.sign(mov) * np.sqrt(abs(mov)) + np.sign(mov) * winners_bonus
    elif method == 'raw_dif':
        target = mov
    elif method == 'capped_with_bonus':
        target = min(mov, cap) + np.sign(mov) * winners_bonus
    return target

# --- Count games per team ---
games_played = {t: 0 for t in teams_list}
for _, row in games.iterrows():
    games_played[row['HomeTeam']] += 1
    games_played[row['AwayTeam']] += 1

# --- Effective games (real + prior) for normalization ---
prior_weight = .5       # 1.0 ~ one game; increase to make priors fade more slowly
prior_as_mov_scale = 1.0    # optional scaling to map prior magnitudes to MOV units if needed

g_eff = {t: games_played[t] + prior_weight for t in teams_list}

# We'll build rows dynamically, then stack at the end so we can easily append priors.
rows = []
targets = []

# --- Real games (row-weighted) ---
for _, row in games.iterrows():
    home_team = row['HomeTeam']
    away_team = row['AwayTeam']
    home_pts  = row['HomePoints']
    away_pts  = row['AwayPoints']
    n
    i = team_to_idx[home_team]
    j = team_to_idx[away_team]

    # rating_i - rating_j = encoded(MOV)
    eq = np.zeros(n, dtype=float)
    eq[i] = 1.0
    eq[j] = -1.0

    mov = home_pts - away_pts
    target = encode_mov(mov, method='capped_with_bonus', cap = 35, winners_bonus= 2.75)  # <-- same encoder used for priors below

    # Row weight to normalize leverage by games played
    # w_ij = sqrt( 2 / (g_eff[i] + g_eff[j]) )
    w = np.sqrt(2.0 / (g_eff[home_team] + g_eff[away_team]))

    rows.append(eq * w)
    targets.append(target * w)

# --- Priors as "Week 0" pseudo-games vs neutral baseline (rating = 0) ---
# Same encoder; keep sqrt(prior_weight) so prior acts like N games, then apply the same normalization idea with g_eff(neutral)=0.
scale_prior = np.sqrt(prior_weight)

for team, prior_rating in prior_dict.items():
    idx = team_to_idx.get(team)
    if idx is None:
        continue  # skip priors for teams not in the current team set

    eq_prior = np.zeros(n, dtype=float)
    eq_prior[idx] = 1.0  # (team) - (neutral=0)

    mov_prior = prior_as_mov_scale * prior_rating
    target_prior = encode_mov(mov_prior, method='capped_with_bonus', cap = 35, winners_bonus= 2.75)

    # For the normalization, treat opponent (neutral) as g_eff = 0:
    # w_i0 = sqrt( 2 / (g_eff[i] + 0) ) = sqrt( 2 / g_eff[i] )
    w_prior = np.sqrt(2.0 / (g_eff[team]))

    # Combine: first weight as "prior_weight games" (scale_prior), then normalize row like real games (w_prior)
    rows.append(eq_prior * (scale_prior * w_prior))
    targets.append(target_prior * (scale_prior * w_prior))

# --- Sum-of-ratings = 0 constraint to anchor the system ---
sum_constraint = np.ones(n, dtype=float)
rows.append(sum_constraint)
targets.append(0.0)

# Stack into matrix/vector
M = np.vstack(rows)
y = np.array(targets, dtype=float)

# Solve in least squares sense (M @ r ~ y)
massey_ratings, residuals, rank, s = np.linalg.lstsq(M, y, rcond=None)

massey_df = pd.DataFrame({
    'Team': teams_list,
    'MasseyRating': massey_ratings
}).sort_values('MasseyRating', ascending=False)


In [31]:


pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Example DataFrame

# Print full DataFrame
print(massey_df.reset_index(drop=True))

                                  Team  MasseyRating
0                               Oregon     44.617244
1                             Oklahoma     44.438529
2                           Penn State     42.550062
3                              Indiana     42.461311
4                              Georgia     41.111469
5                           Ohio State     40.995983
6                             Illinois     40.986668
7                                  USC     39.955736
8                                 Utah     39.884585
9                             Missouri     39.635628
10                  North Dakota State     39.255498
11                           Tennessee     38.253558
12                            Ole Miss     36.917065
13                               Miami     36.514106
14                             Houston     35.439094
15                                 LSU     35.297412
16                            Arkansas     34.607871
17                         North Texas     34.

In [32]:
# Add a new cell to generate betting lines for unplayed games using the existing `massey_df`.

# Find all unplayed games (not completed, regular season)
remaining_games = data[
    (data["Completed"] == False) & (data["SeasonType"] != 'postseason')
][["HomeTeam", "AwayTeam", "Week"]].copy()

# Merge ratings for home/away
team_ratings = dict(zip(massey_df["Team"], massey_df["MasseyRating"]))

def get_line(home, away):
    home_rating = team_ratings.get(home, 0)
    away_rating = team_ratings.get(away, 0)
    diff = home_rating - away_rating
    # Round to nearest 0.5 like betting lines
    spread = round(diff * 2) / 2.0
    if spread > 0:
        line = f"{home} -{abs(spread)}"
    elif spread < 0:
        line = f"{away} -{abs(spread)}"
    else:
        line = "Pick'em"
    return line, spread

remaining_games["Line"], remaining_games["Spread"] = zip(
    *remaining_games.apply(lambda row: get_line(row["HomeTeam"], row["AwayTeam"]), axis=1)
)

# Save to CSV
output_path = "cfb_betting_lines.csv"
remaining_games.to_csv(output_path, index=False)

remaining_games[remaining_games["Week"]==4]


Unnamed: 0,HomeTeam,AwayTeam,Week,Line,Spread
803,Benedict College,Tuskegee,4,Tuskegee -6.5,-6.5
804,Charlotte,Rice,4,Rice -12.5,-12.5
805,Assumption,St. Anselm,4,Assumption -5.5,5.5
806,Lafayette,Columbia,4,Lafayette -8.5,8.5
807,Oklahoma State,Tulsa,4,Tulsa -1.5,-1.5
808,Rutgers,Iowa,4,Rutgers -4.0,4.0
809,Framingham State,UMass Dartmouth,4,UMass Dartmouth -14.0,-14.0
810,South Florida,South Carolina State,4,South Florida -18.0,18.0
811,Army,North Texas,4,North Texas -11.0,-11.0
812,Miami (OH),UNLV,4,UNLV -9.5,-9.5
