In [2]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_csv('historical_corn_herbicide_used.csv')
herb_corn = df[(df['State'] == "Illinois")|(df["State"]=="Iowa")|(df["State"]=="Nebraska")]
herb_corn = herb_corn[herb_corn["Year"]>=2014]
herb_corn

Unnamed: 0,State,Compound,Year,Units,Quantity
2819,Illinois,"2,4-D",2014,kg,378390.7
2820,Illinois,"2,4-D",2015,kg,356897.2
2821,Illinois,"2,4-D",2016,kg,381575.6
2822,Illinois,"2,4-D",2017,kg,308831.0
2845,Illinois,ALACHLOR,2014,kg,223964.5
...,...,...,...,...,...
7235,Nebraska,PARAQUAT,2017,kg,295916.3
7258,Nebraska,PENDIMETHALIN,2014,kg,144308.8
7259,Nebraska,PENDIMETHALIN,2015,kg,163772.9
7260,Nebraska,PENDIMETHALIN,2016,kg,75846.7


In [4]:
corn_prod = pd.read_excel('new_production.xlsx',sheet_name='corn')
corn_prod = corn_prod[(corn_prod['state']=="Illinois")|(corn_prod['state']=="Nebraska")|(corn_prod['state']=="Iowa")]
corn_prod = corn_prod.iloc[:,:5]

corn_prod = corn_prod.melt(id_vars=['state'],var_name='year',value_name="yield",value_vars=['yield per acre_2014','yield per acre_2015','yield per acre_2016','yield per acre_2017'])
corn_prod['year'] = corn_prod['year'].apply(lambda x: x.split('_')[1]).astype(int)

In [6]:

# Read in the production DataFrame
production_df = pd.read_csv('production.csv')

# Group the DataFrame by State, Year, and Compound, and sum the Quantity column
grouped = herb_corn.groupby(['State', 'Year', 'Compound'])['Quantity'].sum().reset_index()

# Join the herbicide usage DataFrame with the production DataFrame on the 'State' and 'Year' columns
for state in ['Illinois', 'Iowa', 'Nebraska']:
    merged_df = pd.merge(grouped, corn_prod, how='left', left_on=['State', 'Year'], right_on=['state', 'year'])
    merged_df.drop(['state', 'year'], axis=1, inplace=True)
    merged_df = merged_df[merged_df['State'] == state]
    merged_df.drop('State',axis=1,inplace=True)

    # use linear regression to predict effect of each compound on yield
    X = merged_df.drop('yield',axis=1)
    y = merged_df['yield']

    X = pd.get_dummies(X,columns=['Compound'],drop_first=False)

    X.drop("Year",axis=1,inplace=True)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

    regr = LinearRegression(positive=True)
    regr.fit(X_train, y_train)
    y_pred = regr.predict(X_test)


    coef_std = regr.coef_

    coef = coef_std

    # The coefficients
    print("###############{}##############".format(state))
    for c in X.columns:
        print(c)
    print('Intercept:{:.4f} \n'.format(regr.intercept_))

    for c in X.columns:
        print(coef[X.columns.get_loc(c)])

    print('Mean squared error: %.2f'
        % mean_squared_error(y_test, y_pred))
    print('Coefficient of determination: %.2f'
        % r2_score(y_test, y_pred))
    

###############Illinois##############
Quantity
Compound_2,4-D
Compound_ALACHLOR
Compound_ATRAZINE
Compound_BROMOXYNIL
Compound_DICAMBA
Compound_DIFLUFENZOPYR
Compound_EPTC
Compound_FORAMSULFURON
Compound_GLYPHOSATE
Compound_HALOSULFURON
Compound_NICOSULFURON
Compound_PARAQUAT
Compound_PENDIMETHALIN
Intercept:175.0000 

0.0
15.999999999999991
23.499999999999982
24.33333333333332
18.249999999999993
16.999999999999975
18.249999999999982
15.999999999999982
12.499999999999977
25.49999999999998
24.333333333333318
0.0
16.999999999999993
25.499999999999982
Mean squared error: 337.65
Coefficient of determination: -1.66
###############Iowa##############
Quantity
Compound_2,4-D
Compound_ALACHLOR
Compound_ATRAZINE
Compound_BROMOXYNIL
Compound_DICAMBA
Compound_DIFLUFENZOPYR
Compound_EPTC
Compound_FORAMSULFURON
Compound_GLYPHOSATE
Compound_HALOSULFURON
Compound_NICOSULFURON
Compound_PARAQUAT
Compound_PENDIMETHALIN
Intercept:178.0000 

0.0
0.0
21.000000000000014
12.666666666666666
12.000000000000004
