In [15]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
#from sklearn.preprocessing import StandardScaler

In [16]:
class LogisticRegression:

    def __init__(self, penalty="l2", gamma=0, fit_intercept=True):
        """
        Parameters:
        - penalty: str, "l1" or "l2". Determines the regularization to be used.
        - gamma: float, regularization coefficient. Used in conjunction with 'penalty'.
        - fit_intercept: bool, whether to add an intercept (bias) term.
        """
        err_msg = "penalty must be 'l1' or 'l2', but got: {}".format(penalty)#汇报错误
        assert penalty in ["l2", "l1"], err_msg
        self.penalty = penalty
        self.gamma = gamma
        self.fit_intercept = fit_intercept#是否加入截距项
        self.coef_ = None

    def sigmoid(self, x):
        return 1/(np.exp(-x)+1)

    def get_gradient(self, X, y, coef_):
        return np.dot(X.T, (self.sigmoid(np.dot(X, coef_)) - y))


    def fit(self, X, y, lr=0.01, tol=1e-7, max_iter=1e5,decay=0.75):#fit的意思是拟合参数，此处使用梯度下降法
        '''
        :param X:
        :param y:
        :param lr:
        :param tol:
        :param max_iter:
        :return losses:
        '''

        if self.fit_intercept:
            X_tilde = np.c_[np.ones(X.shape[0]), X]  # c_是按列连接两个矩阵，np.ones(X.shape[0])是一个全1的矩阵，X是原矩阵
        else:
            X_tilde = X
        # Initialize coefficients
        self.coef_ = np.zeros(X_tilde.shape[1])  # coef_是系数矩阵，初始化为全0矩阵
        
        # List to store loss values at each iteration
        losses = []
        y_pred=self.sigmoid(np.dot(X_tilde,self.coef_))#

        for i in range(int(max_iter)):

            loss=-y*np.dot(X_tilde, self.coef_)+np.log(1+np.exp(np.dot(X_tilde,self.coef_)))
            loss=loss.sum()
            losses.append(loss)

            if self.penalty=='l2':
                self.coef_ = self.coef_ - lr * (self.get_gradient(X_tilde, y, self.coef_)+self.gamma*self.coef_)
            else:
                self.coef_ = self.coef_ - lr * (self.get_gradient(X_tilde, y, self.coef_)+self.gamma*np.sign(self.coef_))
            y_pred = self.sigmoid(np.dot(X_tilde, self.coef_))

            print(f'    iteration:{i},    loss:{loss:.2e}')
            if i>1 and losses[-2]-losses[-1]<0:
                lr=lr*decay
            if lr<tol:
                break

        return losses

    def predict(self, X):#在已经训练好模型后进行预测，此处使用sigmoid函数
        """
        Use the trained model to generate prediction probabilities on a new
        collection of data points.
        
        Parameters:
        - X: numpy array of shape (n_samples, n_features), input data.
        
        Returns:
        - probs: numpy array of shape (n_samples,), prediction probabilities.
        """
        if self.fit_intercept:
            X_tilde = np.c_[np.ones(X.shape[0]), X]

        # Compute the linear combination of inputs and weights
        linear_output = np.dot(X_tilde, self.coef_)

        return np.where(self.sigmoid(linear_output)>=0.5,1,0)

    def cal_accuracy(self,y_pred_test,y_test):
        y_pred=y_pred_test
        #返回一个百分数，并保留4位小数，需要带百分号
        return f'accuracy:{100*np.mean(y_pred==y_test):.4f}%'
    def cal_f1_score(self,y_pred_test,y_test):
        y_pred=y_pred_test
        TP=np.sum((y_pred==1)&(y_test==1))
        FP=np.sum((y_pred==1)&(y_test==0))
        FN=np.sum((y_pred==0)&(y_test==1))
        precision=TP/(TP+FP)
        recall=TP/(TP+FN)
        return f'f1_score:{2*precision*recall/(precision+recall):.4f}'

In [17]:
def trainingset_data_processing(path):
    df=pd.read_excel(path)
    # 处理时间戳
    df['Time Stamp']=df['Time Stamp'][:].apply(lambda x:x[:6]+'20'+x[8:])
    # 处理缺失值
    df_null_rate=df.isnull().sum()/len(df)
    delete_list=[]
    # 删除缺失值超过80%的列，并记录下来 （缺失值处理的第一步）
    for i in range(1,df.shape[1]):
        if df_null_rate[i]>0.8:
            delete_list.append(df.columns[i])
    delete_list.append('Nh')
    delete_list.append('H')
    for i in range(len(delete_list)):
        df.drop(delete_list[i],axis=1,inplace=True)
    # 处理labels
    df_labels=df[['RRR','WW','W2']].copy()
    df_labels['WW'].fillna('无',inplace=True)
    df_labels['W2'].replace('阵性','阵雨',inplace=True)
    df_labels['W2'].replace('雷暴，有降水或无','雷暴，有降雨或无',inplace=True)
    df_labels['W2'].fillna('无',inplace=True)
    # 处理RRR(是原始数据上的label,但是有缺失值)
    df_labels['RRR'].fillna(False,inplace=True)#inplace=True表示在原数据上修改,原数据是df_labels
    df_labels['RRR'].replace('无降水',False,inplace=True)
    df_labels['RRR'].replace('降水迹象',True,inplace=True)
    df_labels['RRR'] = df_labels['RRR'].apply(lambda x:True if (isinstance(x, float) or (isinstance(x, int) and not isinstance(x, bool))) else x)
    # 字里行间看出来的label
    df_labels['WW'].astype('str')
    df_labels['W2'].astype('str')
    df['WW']=np.array([df_labels['WW'][i].find('雨')>0 for i in range(df.shape[0])])
    df['W2']=np.array([df_labels['W2'][i].find('雨')>0 for i in range(df.shape[0])])
    df_labels['part_label']=np.array([df_labels['WW'][i].find('雨')>0 or df_labels['W2'][i].find('雨')>0  for i in range(df_labels.shape[0])])
    # 合并label
    df_labels['real_label']=df_labels['RRR']|df_labels['part_label']
    df_labels['RRR_raw']=df['RRR'].copy()
    #调整位置，把RRR_raw放在第一列
    df_labels=df_labels[['RRR_raw','RRR','WW','W2','part_label','real_label']]
    # 取保时间戳是datetime格式
    df['Time Stamp'] = pd.to_datetime(df['Time Stamp'], format='%d.%m.%Y %H:%M')
    # 存入可以被训练的labels
    df['real_label']=df_labels['real_label'].copy()
    # labels转换为0和1，为后续Logistic Regression 做准备
    df['real_label'].replace(True,1,inplace=True)
    df['real_label'].replace(False,0,inplace=True)
    df['VV'].replace('低于 0.1',0,inplace=True) # 目的是把低于0.1替换成0，以减少onhot编码的维度
    # 缺失值处理的第二步
    # 处理缺失值，若是float或int类型，用均值或者中位数填充；若是str，用多项分布进行随机填充
    # 同时要记录下prob，用于测试集的填充
    Mean_4_fillna={}
    Prob_4_fillna={}
    for col in df.columns:
        if df[col].dtype=='float64' or df[col].dtype=='int64':
            Mean_4_fillna[col]=df[col].mean()
            df[col].fillna(df[col].mean(),inplace=True)
            # df[col].fillna(df[col].median(),inplace=True) #用中位数填充
        elif df[col].dtype=='object':
            # 用众数填充
            df[col].fillna(df[col].mode()[0],inplace=True)
            # 记录众数
            Prob_4_fillna[col]=df[col].mode()[0]
            #prob=df[col].value_counts(normalize=True)
            #Prob_4_fillna[col]=prob
            #df[col]=df[col].apply(lambda x:prob.index[np.random.multinomial(1,prob).argmax()] if pd.isnull(x) else x)
    # onehot编码
    df_timestamp=df['Time Stamp'].copy()
    df.drop(['Time Stamp'],axis=1,inplace=True)
    LABELS=df['real_label'].copy()
    df.drop(['real_label'],axis=1,inplace=True)
    df_RRR=df['RRR'].copy()
    df.drop(['RRR'],axis=1,inplace=True)
    # 为onehot编码做准备
    df_RRR.replace('降水迹象',df_RRR[df_RRR.apply(lambda x:isinstance(x,float))].mean(),inplace=True)
    df_RRR.replace('无降水',0,inplace=True)
    
    # onehot编码,并记录下来，为后续测试集的onehot编码做准备
    df_onehot = pd.get_dummies(df,dtype='float64')
    df_dummies=df_onehot
    
    # 合并数据
    df_onehot['Time Stamp']=df_timestamp
    df_onehot['real_label']=LABELS
    df_onehot=df_onehot[['Time Stamp']+list(df_onehot.columns[:-2])+['real_label']]
    
    # 合并小时数据成为一天的数据
    #Time Stamp里面是小时的数据，这里按照天取平均
    df_onehot['Time Stamp']=pd.to_datetime(df_onehot['Time Stamp'],format='%Y-%m-%d %H:%M:%S')
    # 对每天的label进行处理，取最大值
    for col in df_onehot.columns[1:-1]:
        df_onehot_X=df_onehot.groupby(df_onehot['Time Stamp'].dt.date)[col].mean()
        df_onehot[col]=df_onehot['Time Stamp'].dt.date.map(df_onehot_X)
    df_onehot_y=df_onehot.groupby(df_onehot['Time Stamp'].dt.date)['real_label'].max()
    df_onehot['real_label']=df_onehot['Time Stamp'].dt.date.map(df_onehot_y)
    return df_onehot,Mean_4_fillna,Prob_4_fillna,delete_list,df_dummies

In [18]:
def split_train_test(df_onehot):
    split_rate=1
    #划分训练集和测试集
    X_train,X_test,y_train,y_test=df_onehot.iloc[:int(df_onehot.shape[0]*split_rate),1:-1],df_onehot.iloc[int(df_onehot.shape[0]*split_rate):,1:-1],df_onehot.iloc[:int(df_onehot.shape[0]*split_rate),-1],df_onehot.iloc[int(df_onehot.shape[0]*split_rate):,-1]
    # 标准化
    MEAN=X_train.mean()
    STD=X_train.std()
    X_train=(X_train-MEAN)/STD
    X_test=(X_test-MEAN)/STD
    # 标准化,并记录scaler，用于测试集的标准化
    #scaler=StandardScaler()
    #X_train=scaler.fit_transform(X_train)
    #X_test=scaler.transform(X_test)
    return X_train,X_test,y_train,y_test,MEAN,STD

In [19]:
df_onehot,Mean_4_fillna,Prob_4_fillna,delete_list,df_dummies=trainingset_data_processing('training_dataset.xls')
pd.DataFrame(df_onehot.iloc[:,-1]).to_csv('labels.csv')
X_train,X_test,y_train,y_test,MEAN,STD=split_train_test(df_onehot)

In [20]:
model=LogisticRegression()
model.fit(X_train,y_train,lr=0.001,max_iter=300,decay=0.75)
#y_pred=model.predict(X_test)
#print(f'test_accuracy:{np.sum(y_pred==y_test)/len(y_test)},f1:{np.sum(y_pred*y_test)/(np.sum(y_pred)+np.sum(y_test)-np.sum(y_pred*y_test))}')

    iteration:0,    loss:2.83e+04
    iteration:1,    loss:1.10e+05
    iteration:2,    loss:7.89e+04
    iteration:3,    loss:8.58e+04
    iteration:4,    loss:1.59e+05
    iteration:5,    loss:1.19e+05
    iteration:6,    loss:5.37e+04
    iteration:7,    loss:3.80e+04
    iteration:8,    loss:3.15e+04
    iteration:9,    loss:2.65e+04
    iteration:10,    loss:2.24e+04
    iteration:11,    loss:1.90e+04
    iteration:12,    loss:1.65e+04
    iteration:13,    loss:1.55e+04
    iteration:14,    loss:1.99e+04
    iteration:15,    loss:4.71e+04
    iteration:16,    loss:4.66e+04
    iteration:17,    loss:2.28e+04
    iteration:18,    loss:1.61e+04
    iteration:19,    loss:1.33e+04
    iteration:20,    loss:1.19e+04
    iteration:21,    loss:1.09e+04
    iteration:22,    loss:1.01e+04
    iteration:23,    loss:9.53e+03
    iteration:24,    loss:9.05e+03
    iteration:25,    loss:8.66e+03
    iteration:26,    loss:8.33e+03
    iteration:27,    loss:8.06e+03
    iteration:28,    loss:7.83

[28262.38314015121,
 110420.72815514854,
 78915.4738254297,
 85848.41047552445,
 158894.70623305687,
 118947.6757566019,
 53731.67261251242,
 38030.587688860855,
 31513.27966279298,
 26483.568847230876,
 22364.265829843935,
 19012.246615107902,
 16513.69713103341,
 15482.20045825149,
 19938.616566908007,
 47055.47495389013,
 46644.99907002318,
 22761.59176058242,
 16130.377420223344,
 13255.223182737012,
 11902.531000723453,
 10901.50526752747,
 10129.02633387796,
 9526.540363093292,
 9047.186590811236,
 8656.083921847743,
 8331.257525771656,
 8059.439023884854,
 7832.39786478707,
 7648.4168315716815,
 7529.638666427059,
 7630.7316990158,
 8905.361912147288,
 10694.145397413155,
 9687.433536836255,
 7418.963782271492,
 7223.360237281322,
 7133.989689480433,
 7072.770084913591,
 7025.620161290734,
 6987.368245862915,
 6955.315980514968,
 6927.8248134838395,
 6903.844587175918,
 6882.682415640126,
 6863.8668805730595,
 6847.0641376156,
 6832.025848209924,
 6818.557118886923,
 6806.496984

In [21]:
import json

# 假设`model`是您训练好的LogisticRegression模型实例
model_params = {
    'coef': model.coef_.tolist(),  # 将numpy数组转换为列表以进行JSON序列化
    'penalty': model.penalty,
    'gamma': model.gamma,
    'fit_intercept': model.fit_intercept,
    'MEAN': MEAN.tolist(),
    'STD': STD.tolist(),
    'Mean_4_fillna': Mean_4_fillna.tolist() if isinstance(Mean_4_fillna, pd.Series) else Mean_4_fillna,
    'Prob_4_fillna' : {key: value for key, value in Prob_4_fillna.items()} if isinstance(Prob_4_fillna, dict) else Prob_4_fillna,
    'delete_list': delete_list
}

# 保存为JSON
with open('model_params.json', 'w') as file:
    json.dump(model_params, file)