In [4]:
from pyspark.sql import SparkSession
import numpy as np

In [5]:
spark = SparkSession.builder.appName("CSCI316-process_three_logisticRegression(simple)") \
.config("spark-master", "local") \
.getOrCreate()

df_FD = spark \
.read \
.format("csv") \
.option("header", "true").load("block_1.csv")

df_FD.printSchema()
df_FD.show(10)

root
 |-- id_1: string (nullable = true)
 |-- id_2: string (nullable = true)
 |-- cmp_fname_c1: string (nullable = true)
 |-- cmp_fname_c2: string (nullable = true)
 |-- cmp_lname_c1: string (nullable = true)
 |-- cmp_lname_c2: string (nullable = true)
 |-- cmp_sex: string (nullable = true)
 |-- cmp_bd: string (nullable = true)
 |-- cmp_bm: string (nullable = true)
 |-- cmp_by: string (nullable = true)
 |-- cmp_plz: string (nullable = true)
 |-- is_match: string (nullable = true)

+-----+-----+-----------------+------------+------------+------------+-------+------+------+------+-------+--------+
| id_1| id_2|     cmp_fname_c1|cmp_fname_c2|cmp_lname_c1|cmp_lname_c2|cmp_sex|cmp_bd|cmp_bm|cmp_by|cmp_plz|is_match|
+-----+-----+-----------------+------------+------------+------------+-------+------+------+------+-------+--------+
|37291|53113|0.833333333333333|           ?|           1|           ?|      1|     1|     1|     1|      0|    TRUE|
|39086|47614|                1|           ?|  

In [6]:
##following is to do preprocessing
from pyspark.sql.functions import when   
from pyspark.sql.functions import regexp_replace,col
df_FD = df_FD.withColumn('is_match', regexp_replace(col('is_match'), "FALSE", "1"))
df_FD = df_FD.withColumn('is_match', regexp_replace(col('is_match'), "TRUE", "0"))

In [7]:
#convert each tuples of RDD to list
rdd1 = df_FD.rdd.map(list)
rdd1.first()

['37291',
 '53113',
 '0.833333333333333',
 '?',
 '1',
 '?',
 '1',
 '1',
 '1',
 '1',
 '0',
 '0']

In [9]:
#delete first two columns and covert "?" to integer 2 ,otherwise float it. 
def preprocessing(pieces):

    scores = [ 2 if p=='?' else float(p) for p in pieces[2:12]]
    
    return scores

dataset = rdd1.map(lambda x: preprocessing(x)).collect()

record_linkage = np.array(dataset)

X = record_linkage[:, :8]
y=record_linkage[:,9]
y=y.astype(int)
print(X)
print(y)

[[0.83333333 2.         1.         ... 1.         1.         1.        ]
 [1.         2.         1.         ... 1.         1.         1.        ]
 [1.         2.         1.         ... 1.         1.         1.        ]
 ...
 [1.         2.         0.09090909 ... 0.         1.         0.        ]
 [1.         2.         0.11111111 ... 0.         1.         0.        ]
 [1.         2.         0.         ... 1.         0.         0.        ]]
[0 0 0 ... 1 1 1]


In [13]:
import numpy as np

# GENERATING RANDOM DATA FOR TRAINING TESTING 

np.random.seed(12)
num_observations = 5000


simulated_separableish_features = X
simulated_labels = y


# BUILDING A LINK FUNCTION
def sigmoid(scores):
    return 1 / (1 + np.exp(-scores))


# DEFINING LOG LIKELIHOOD

def log_likelihood(features, target, weights):
    scores = np.dot(features, weights)
    ll = np.sum( target*scores - np.log(1 + np.exp(scores)) )
    return ll

# BUILDING MAIN LOGISTIC REGRESSION FUNCTION 

def logistic_regression(features, target, num_steps, learning_rate, add_intercept = False):
    if add_intercept:
        intercept = np.ones((features.shape[0], 1))
        features = np.hstack((intercept, features))
        
    weights = np.zeros(features.shape[1])
    
    for step in range(num_steps):
        scores = np.dot(features, weights)
        predictions = sigmoid(scores)

        # Update weights with log likelihood gradient
        output_error_signal = target - predictions
        
        gradient = np.dot(features.T, output_error_signal)
        weights += learning_rate * gradient

        # Print log-likelihood every so often
        if step % 10000 == 0:
            print (log_likelihood(features, target, weights))
        
    return weights

# WIEGHTS FOR LOGISTIC REGRESSION BUILT FROM SCRATCH

weights = logistic_regression(simulated_separableish_features, simulated_labels,
                     num_steps = 50000, learning_rate = 5e-5, add_intercept=True)
print ("LOGISTIC REGRESSION FROM SRATCH WEIGHTS => ",weights)


final_scores = np.dot(np.hstack((np.ones((simulated_separableish_features.shape[0], 1)),
                                 simulated_separableish_features)), weights)

preds = np.round(sigmoid(final_scores))

label = simulated_labels

pred = preds
           
conf_mat = np.zeros([2, 2])
for i in range(len(pred)):

    row = int(1 - label[i])
    col = int(1 - pred[i])
    conf_mat[row][col] += 1

TP = conf_mat[0][0]
FP = conf_mat[1][0]
FN = conf_mat[0][1]
TN = conf_mat[1][1]
P = conf_mat[0].sum()
N = conf_mat[1].sum()
All = P + N
Precision=TP / (TP + FP)
Recall=TP /(TP + FN)
    
print(" ")
print("Confusion matrix:")
print("\t", conf_mat[0])
print("\t", conf_mat[1])
print("\tAcc: ", (TP + TN) / All)
print("\tPrecision : ", TP / (TP + FP))
print("\tRecall: ",TP /(TP + FN))
print("\tF1-score: ",2*(Recall * Precision) / (Recall + Precision))
print("-------------------------")
print()

print ('Accuracy from scratch: {0}'.format((preds == simulated_labels).sum().astype(float) / len(preds)))


-348453.66541805235
-577.4347979294267
-548.714542254037
-538.3829217196156
-533.4215045056905
LOGISTIC REGRESSION FROM SRATCH WEIGHTS =>  [ 19.88446497  -7.81630299   0.40043797 -17.50293336   4.81276592
   0.09469743  -3.22698096  -1.53608846  -3.31022303]
 
Confusion matrix:
	 [5.72727e+05 9.30000e+01]
	 [  24. 2069.]
	Acc:  0.9997964909473259
	Precision :  0.9999580969740778
	Recall:  0.9998376453336126
	F1-score:  0.9998978675263254
-------------------------

Accuracy from scratch: 0.9997964909473259
