#### IMPORT SPARK

In [1]:
import findspark

In [2]:
import os

# Prefer an explicitly configured SPARK_HOME so the notebook is not tied to one
# machine; fall back to the original install path when the variable is unset.
findspark.init(os.environ.get('SPARK_HOME', '/home/cse587/spark-2.4.0-bin-hadoop2.7'))

In [3]:
import pyspark
from pyspark.sql import SparkSession
import csv

In [4]:
# getOrCreate() returns the already-running context when there is one, so
# re-executing this cell no longer fails with
# "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate()

In [5]:
pyspark

<module 'pyspark' from '/home/cse587/spark-2.4.0-bin-hadoop2.7/python/pyspark/__init__.py'>

In [6]:
from pyspark.sql import SQLContext
# SQL entry point bound to the SparkContext `sc`; every createDataFrame() call
# in this notebook goes through this object.
sqlContext = SQLContext(sc)

#### INPUT DATA PROCESSING

In [7]:
import pandas as pd
# Load train.csv on the driver with pandas; the next cell converts it into a
# Spark DataFrame.
pandas_df = pd.read_csv("train.csv")
#RDD = sc.textFile("train.csv")

In [8]:
# Convert the pandas frame into a Spark DataFrame and keep a handle on its RDD.
# NOTE(review): rdd_df is not referenced by any later cell visible in this notebook.
spark_df = sqlContext.createDataFrame(pandas_df)
rdd_df = spark_df.rdd

In [9]:
spark_df.show(3)

+--------+----------------+--------------------+--------------------+
|movie_id|      movie_name|                plot|               genre|
+--------+----------------+--------------------+--------------------+
|23890098|      Taxi Blues|Shlykov, a hard-w...|['World cinema', ...|
|31186339|The Hunger Games|The nation of Pan...|['Action/Adventur...|
|20663735|      Narasimham|Poovalli Induchoo...|['Musical', 'Acti...|
+--------+----------------+--------------------+--------------------+
only showing top 3 rows



In [10]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

In [11]:
# Plain Tokenizer for the 'plot' text. Its output (shown two cells below) keeps
# punctuation attached to words, e.g. "shlykov,"; it is only used for that
# comparison display, the pipeline continues from regex_tokenizer.
tokenizer = Tokenizer(inputCol = 'plot', outputCol = 'plot_terms')

In [12]:
# Tokenizer that splits 'plot' on non-word characters (pattern \W) and writes
# the tokens to 'plot_terms'. The same instance is reused later for the test set.
regex_tokenizer = RegexTokenizer(inputCol = 'plot', outputCol = 'plot_terms', pattern = '\\W')

In [13]:
# UDF returning the number of tokens in a 'plot_terms' array as an integer.
# NOTE(review): not referenced by any later cell visible in this notebook.
count_tokens = udf(lambda plot_terms: len(plot_terms), IntegerType())

In [14]:
tokenized = tokenizer.transform(spark_df)

In [15]:
#tokenized.show(1)
tokenized.select("plot_terms").show(2)

+--------------------+
|          plot_terms|
+--------------------+
|[shlykov,, a, har...|
|[the, nation, of,...|
+--------------------+
only showing top 2 rows



In [16]:
# Tokenise the training plots with the regex tokenizer; the stop-word removal
# below starts from this DataFrame.
regex_tokenized = regex_tokenizer.transform(spark_df)
regex_tokenized.show(2)

+--------+----------------+--------------------+--------------------+--------------------+
|movie_id|      movie_name|                plot|               genre|          plot_terms|
+--------+----------------+--------------------+--------------------+--------------------+
|23890098|      Taxi Blues|Shlykov, a hard-w...|['World cinema', ...|[shlykov, a, hard...|
|31186339|The Hunger Games|The nation of Pan...|['Action/Adventur...|[the, nation, of,...|
+--------+----------------+--------------------+--------------------+--------------------+
only showing top 2 rows



In [17]:
from pyspark.ml.feature import StopWordsRemover

In [18]:
# Stop-word filter: reads 'plot_terms', writes the remaining tokens to 'filtered'.
# No stop-word list is passed, so the transformer's default list applies.
# The same instance is reused later for the test set.
remover = StopWordsRemover(inputCol = 'plot_terms', outputCol = 'filtered')

In [19]:
# "withnum": numeric tokens are still present in 'filtered' at this point;
# they are dropped by is_not_digit() in a later cell.
stopword_tokenized_withnum = remover.transform(regex_tokenized)

In [20]:
stopword_tokenized_withnum.show(2)

+--------+----------------+--------------------+--------------------+--------------------+--------------------+
|movie_id|      movie_name|                plot|               genre|          plot_terms|            filtered|
+--------+----------------+--------------------+--------------------+--------------------+--------------------+
|23890098|      Taxi Blues|Shlykov, a hard-w...|['World cinema', ...|[shlykov, a, hard...|[shlykov, hard, w...|
|31186339|The Hunger Games|The nation of Pan...|['Action/Adventur...|[the, nation, of,...|[nation, panem, c...|
+--------+----------------+--------------------+--------------------+--------------------+--------------------+
only showing top 2 rows



In [21]:
#stopword_tokenized_withnum.select('filtered').show(2, truncate=False)

In [22]:
def is_not_digit(rec):
    """Return the tokens of ``rec`` that do not consist solely of digits.

    ``rec`` is an iterable of string tokens. Order is preserved and a new
    list is returned; tokens that mix digits with other characters are kept.
    """
    return [token for token in rec if not token.isdigit()]
            
# Pass the first five columns through unchanged and strip digit-only tokens
# from the sixth ('filtered'), then rebuild a DataFrame with the same names.
stopword_without_numbers = stopword_tokenized_withnum.rdd.map(
    lambda row: [row[0], row[1], row[2], row[3], row[4], is_not_digit(row[5])]
)
stopword_tokenized = sqlContext.createDataFrame(
    stopword_without_numbers,
    ['movie_id', 'movie_name', 'plot', 'genre', 'plot_terms', 'filtered'],
)
stopword_tokenized.show(5)

+--------+------------------+--------------------+--------------------+--------------------+--------------------+
|movie_id|        movie_name|                plot|               genre|          plot_terms|            filtered|
+--------+------------------+--------------------+--------------------+--------------------+--------------------+
|23890098|        Taxi Blues|Shlykov, a hard-w...|['World cinema', ...|[shlykov, a, hard...|[shlykov, hard, w...|
|31186339|  The Hunger Games|The nation of Pan...|['Action/Adventur...|[the, nation, of,...|[nation, panem, c...|
|20663735|        Narasimham|Poovalli Induchoo...|['Musical', 'Acti...|[poovalli, induch...|[poovalli, induch...|
| 2231378|The Lemon Drop Kid|The Lemon Drop Ki...|          ['Comedy']|[the, lemon, drop...|[lemon, drop, kid...|
|  595909| A Cry in the Dark|Seventh-day Adven...|['Crime Fiction',...|[seventh, day, ad...|[seventh, day, ad...|
+--------+------------------+--------------------+--------------------+-----------------

In [23]:
stopword_tokenized.select('filtered').show(2, truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [26]:
from pyspark.ml.feature import Word2Vec

In [27]:
# Word2Vec over the cleaned token lists: 350-dimensional vectors, with a
# minimum token count of 5, written to 'rawfeatures'.
w2v = Word2Vec(
    inputCol="filtered",
    outputCol="rawfeatures",
    vectorSize=350,
    minCount=5,
)

In [28]:
# Fit Word2Vec on the training tokens and add the 'rawfeatures' vector column.
# Despite the names, doc2vec_model is the fitted Word2Vec model and cv_df is
# its output (the names are left over from the disabled CountVectorizer cells).
doc2vec_model = w2v.fit(stopword_tokenized)
cv_df = doc2vec_model.transform(stopword_tokenized)

#### ALTERNATIVE FEATURES: TERM FREQUENCIES VIA HashingTF OR CountVectorizer (CELLS BELOW ARE DISABLED)

In [27]:
 #from pyspark.ml.feature import HashingTF, IDF, Tokenizer

In [28]:
 #hashing_tf = HashingTF(inputCol = 'rawfeatures', outputCol = 'features')

In [29]:
#USES HASHING TO CALCULATE TF. A BETTER APPROACH BUT CONFUSING.
# featurized_df = hashing_tf.transform(stopword_tokenized)
# featurized_df.show(2)

In [30]:
# featurized_df.show(2)

In [31]:
#from pyspark.ml.feature import CountVectorizer,IDF

In [32]:
#cv = CountVectorizer(inputCol = 'filtered', vocabSize = 10000, minDF = 5.0, outputCol = 'raw_features')

In [33]:
#cv_model = cv.fit(stopword_tokenized)
#cv_df = cv_model.transform(stopword_tokenized)

In [34]:
#cv_df.select("raw_features").show(2, truncate = False)

In [35]:
#cv_df.take(1)

In [36]:
#idf = IDF(inputCol = 'raw_features', outputCol = 'features')
#idf_model = idf.fit(cv_df)
#final_features = idf_model.transform(cv_df)
#final_features = final_features.select("genre","features")

#final_features = cv_df.select("genre", "features")

In [29]:
# Keep only the genre string and the Word2Vec vector, exposing the vector
# under the name 'features'.
final_features = (
    cv_df
    .withColumnRenamed('rawfeatures', 'features')
    .select("genre", "features")
)
final_features.show()

+--------------------+--------------------+
|               genre|            features|
+--------------------+--------------------+
|['World cinema', ...|[-0.0050093183458...|
|['Action/Adventur...|[0.00205552355450...|
|['Musical', 'Acti...|[0.00542103223440...|
|          ['Comedy']|[-2.4456956043942...|
|['Crime Fiction',...|[-0.0159982558594...|
|['Action/Adventur...|[-0.0444203289753...|
|['Thriller', 'Dra...|[-0.0241131160476...|
|           ['Drama']|[-0.0371631226276...|
|['Black-and-white...|[-0.0205838167644...|
|['Animation', 'Sh...|[-0.0116436669437...|
|          ['Comedy']|[-0.0257132552618...|
|['Crime Fiction',...|[-0.0026438752506...|
|          ['Comedy']|[-0.0410785525968...|
|          ['Comedy']|[-0.0248786258935...|
|          ['Horror']|[-0.0306048169959...|
|['Crime Fiction',...|[-0.0024029585144...|
|           ['Drama']|[-0.0267404576595...|
|['Crime Fiction',...|[-0.0197751616995...|
|  ['Indie', 'Drama']|[-0.0164763600469...|
|           ['Drama']|[-0.019983

In [30]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

In [31]:
# Indexes the raw 'genre' string, i.e. the complete genre list of a movie, as a
# single label. NOTE: label_stringIdx is rebound further down to a different
# StringIndexer that works on 'string_label'.
label_stringIdx = StringIndexer(inputCol = 'genre', outputCol = 'label')

In [32]:
pipeline = Pipeline(stages=[label_stringIdx])

In [33]:
# pipelineFit = pipeline.fit(final_features)
# Fit and apply the single-stage pipeline: 'label' indexes each distinct genre
# combination as a whole.
# NOTE(review): `dataset` is not used by any later cell visible in this notebook.
dataset = pipeline.fit(final_features).transform(final_features)
dataset.show(5)

+--------------------+--------------------+------+
|               genre|            features| label|
+--------------------+--------------------+------+
|['World cinema', ...|[-0.0050093183458...|   4.0|
|['Action/Adventur...|[0.00205552355450...|1603.0|
|['Musical', 'Acti...|[0.00542103223440...| 316.0|
|          ['Comedy']|[-2.4456956043942...|   1.0|
|['Crime Fiction',...|[-0.0159982558594...| 113.0|
+--------------------+--------------------+------+
only showing top 5 rows



In [34]:
test = final_features


In [35]:
def splitter(rec):
    """Flatten the stringified genre list of one movie into comma-separated text.

    ``rec`` is the raw ``genre`` cell: the repr of a Python list such as
    ``"['World cinema', 'Drama']"``, for which ``"World cinema, Drama"`` is
    returned.

    The value is parsed with ``ast.literal_eval`` instead of deleting every
    bracket and quote character. Genre names containing an apostrophe (which
    Python renders with double quotes, e.g. ``"Children's/Family"``) therefore
    keep their apostrophe and are not left wrapped in stray double quotes.
    Input that is not a list/tuple literal falls back to the original
    character-stripping behaviour.
    """
    import ast  # local import keeps this cell self-contained

    try:
        parsed = ast.literal_eval(rec)
    except (ValueError, SyntaxError, TypeError):
        parsed = None

    if isinstance(parsed, (list, tuple)):
        # ", " is the separator the original stripping left between genres.
        return ', '.join(str(item) for item in parsed)

    # Legacy path: drop brackets and single quotes character by character.
    return rec.replace('[', '').replace(']', '').replace('\'', '')
# Clean the genre string of every row (column 0) and keep the feature vector
# (column 1) untouched, then rebuild a two-column DataFrame.
test1 = test.rdd.map(lambda row: [splitter(row[0]), row[1]])
test2 = sqlContext.createDataFrame(test1, ['genre', 'features'])
test2.show(5)

+--------------------+--------------------+
|               genre|            features|
+--------------------+--------------------+
| World cinema, Drama|[-0.0050093183458...|
|Action/Adventure,...|[0.00205552355450...|
|Musical, Action, ...|[0.00542103223440...|
|              Comedy|[-2.4456956043942...|
|Crime Fiction, Wo...|[-0.0159982558594...|
+--------------------+--------------------+
only showing top 5 rows



In [36]:
import pyspark.sql.functions as f

# One output row per (movie, genre) pair: 'genre' holds the full split list,
# 'pos' the position inside it and 'string_label' the single genre.
# Splitting on a bare comma leaves a leading space on later entries; a later
# cell strips it.
test3 = test2.select(
    "features",
    f.split('genre', ',').alias("genre"),
    f.posexplode(f.split("genre", ",")).alias("pos", "string_label"),
)
test3.show(2)

+--------------------+--------------------+---+------------+
|            features|               genre|pos|string_label|
+--------------------+--------------------+---+------------+
|[-0.0050093183458...|[World cinema,  D...|  0|World cinema|
|[-0.0050093183458...|[World cinema,  D...|  1|       Drama|
+--------------------+--------------------+---+------------+
only showing top 2 rows



In [35]:
#dataset = test3.select("features", f.concat(f.lit("genre"),f.col('pos').cast("string")).alias("name"),f.expr("genre[pos]").alias("string_label")).groupBy("features").pivot("name").agg(f.first("string_label"))


In [36]:
#dataset.show(1, truncate=False)
#dataset.show(1)

In [37]:
# Long format used for labelling: one (features, single genre) row per pair.
final_dataset = test3.select("features", "string_label")

In [38]:
final_dataset.show(4)

+--------------------+----------------+
|            features|    string_label|
+--------------------+----------------+
|[-0.0050093183458...|    World cinema|
|[-0.0050093183458...|           Drama|
|[0.00205552355450...|Action/Adventure|
|[0.00205552355450...|          Action|
+--------------------+----------------+
only showing top 4 rows



In [39]:
# Rebinds label_stringIdx: this indexer numbers individual genres
# ('string_label'), replacing the earlier one that indexed whole genre lists.
label_stringIdx = StringIndexer(inputCol = 'string_label', outputCol = 'label')

In [40]:
# final_dataset1 = label_stringIdx.fit(fd2).transform(fd2)
# final_dataset1.show(5)

In [40]:
def label_mod(rec):
    """Return ``rec`` with leading and trailing whitespace removed."""
    return rec.strip()
# Trim the genre label of every row (column 1); the feature vector (column 0)
# is passed through unchanged.
fd1_rdd = final_dataset.rdd.map(lambda row: [row[0], label_mod(row[1])])
fd2 = sqlContext.createDataFrame(fd1_rdd, ['features', 'string_label'])
fd2.show(5)

+--------------------+----------------+
|            features|    string_label|
+--------------------+----------------+
|[-0.0050093183458...|    World cinema|
|[-0.0050093183458...|           Drama|
|[0.00205552355450...|Action/Adventure|
|[0.00205552355450...|          Action|
|[0.00205552355450...| Science Fiction|
+--------------------+----------------+
only showing top 5 rows



In [41]:
# Attach a numeric 'label' index to every trimmed genre name.
final_dataset2 = label_stringIdx.fit(fd2).transform(fd2)
final_dataset2.show(5)

+--------------------+----------------+-----+
|            features|    string_label|label|
+--------------------+----------------+-----+
|[-0.0050093183458...|    World cinema|  5.0|
|[-0.0050093183458...|           Drama|  0.0|
|[0.00205552355450...|Action/Adventure| 10.0|
|[0.00205552355450...|          Action|  4.0|
|[0.00205552355450...| Science Fiction| 17.0|
+--------------------+----------------+-----+
only showing top 5 rows



In [42]:
# Wide format: one row per distinct 'features' vector and one column per genre,
# each cell holding the summed label index of that genre for that vector.
# NOTE(review): rows are keyed by the feature vector only, so movies with
# identical vectors would be merged into one row - confirm this is acceptable.
pivotDF = final_dataset2.groupBy("features").pivot("string_label").sum('label')
#pivotDF.show(10)

In [43]:
from pyspark.sql.functions import when

# Every column after the grouping key ('features') is one genre.
cols = pivotDF.columns[1:]

# Turn the pivoted sums into 0/1 indicators: a cell holding a summed label
# index (>= 0) becomes 1, everything else becomes 0.
# The loop variable is deliberately not named `col`: that would rebind the
# pyspark.sql.functions.col helper imported earlier in this notebook.
for genre_col in cols:
    pivotDF = pivotDF.withColumn(
        genre_col,
        when(pivotDF[genre_col] >= 0, 1).otherwise(0),
    )

In [45]:
#pivotDF.show(10)

In [45]:
from pyspark.ml.classification import LogisticRegression

# One estimator, reused to fit a separate binary model for every genre below.
lr = LogisticRegression(
    featuresCol='features',
    labelCol='label',
    maxIter=100,
)

In [46]:
train = pivotDF

In [47]:
# Model 1 - Drama: the 0/1 'Drama' indicator becomes the binary 'label'.
train_1 = train.select('features', train['Drama'].alias('label'))
lr_model_1 = lr.fit(train_1)
#train_test_1 = train_test.select('features','Drama').withColumnRenamed("Drama","label") 
#predictions_1 = lr_model_1.transform(train_test_1)

In [47]:
#train_test_1.show()

In [48]:
#predictions_1.show(3)

In [47]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# NOTE: BinaryClassificationEvaluator reports areaUnderROC unless another
# metricName is set, so the figure printed by the disabled lines below is an
# AUC, not an accuracy, despite the "Test Accuracy" label.
#evaluator = BinaryClassificationEvaluator(rawPredictionCol = "rawPrediction")
#print("Test Accuracy: " + str(evaluator.evaluate(predictions_1)))

Test Accuracy: 0.7890092774232162


In [50]:
#### Model- 2 - Comedy

In [48]:
# Model 2 - Comedy: the 0/1 'Comedy' indicator becomes the binary 'label'.
train_2 = train.select('features', train['Comedy'].alias('label'))
lr_model_2 = lr.fit(train_2)
#train_test_2 = train_test.select('features','Comedy').withColumnRenamed("Comedy","label") 
#predictions_2 = lr_model_2.transform(train_test_2)

In [56]:
#evaluator = BinaryClassificationEvaluator(rawPredictionCol = "rawPrediction")
#print("Test Accuracy: " + str(evaluator.evaluate(predictions_2)))

Test Accuracy: 0.7527455608626785


In [65]:
#### Model -3 - Romance Film

In [49]:
# Model 3 - Romance Film: the 0/1 'Romance Film' indicator becomes 'label'.
train_3 = train.select('features', train['Romance Film'].alias('label'))
lr_model_3 = lr.fit(train_3)
#train_test_3 = train_test.select('features','Romance Film').withColumnRenamed("Romance Film","label") 
#predictions_3 = lr_model_3.transform(train_test_3)

In [67]:
#evaluator = BinaryClassificationEvaluator(rawPredictionCol = "rawPrediction")
#print("Test Accuracy: " + str(evaluator.evaluate(predictions_3)))

In [68]:
#### Model -4 - Thriller

In [50]:
# Model 4 - Thriller: the 0/1 'Thriller' indicator becomes the binary 'label'.
train_4 = train.select('features', train['Thriller'].alias('label'))
lr_model_4 = lr.fit(train_4)
#train_test_4 = train_test.select('features','Thriller').withColumnRenamed("Thriller","label") 
#predictions_4 = lr_model_4.transform(train_test_4)

In [70]:
#evaluator = BinaryClassificationEvaluator(rawPredictionCol = "rawPrediction")
#print("Test Accuracy: " + str(evaluator.evaluate(predictions_4)))

In [71]:
#### Model -5 - Action

In [51]:
# Model 5 - Action: the 0/1 'Action' indicator becomes the binary 'label'.
train_5 = train.select('features', train['Action'].alias('label'))
lr_model_5 = lr.fit(train_5)
#train_test_5 = train_test.select('features','Action').withColumnRenamed("Action","label") 
#predictions_5 = lr_model_5.transform(train_test_5)

In [73]:
#### Model -6 - World cinema

In [52]:
# Model 6 - World cinema: the 0/1 'World cinema' indicator becomes 'label'.
train_6 = train.select('features', train['World cinema'].alias('label'))
lr_model_6 = lr.fit(train_6)
#train_test_6 = train_test.select('features','World cinema').withColumnRenamed("World cinema","label") 
#predictions_6 = lr_model_6.transform(train_test_6)

In [75]:
#### Model -7 - Crime Fiction

In [53]:
# Model 7 - Crime Fiction: the 0/1 'Crime Fiction' indicator becomes 'label'.
train_7 = train.select('features', train['Crime Fiction'].alias('label'))
lr_model_7 = lr.fit(train_7)
#train_test_7 = train_test.select('features','Crime Fiction').withColumnRenamed("Crime Fiction","label") 
#predictions_7 = lr_model_7.transform(train_test_7)

In [77]:
#### Model -8 - Horror

In [54]:
# Model 8 - Horror: the 0/1 'Horror' indicator becomes the binary 'label'.
train_8 = train.select('features', train['Horror'].alias('label'))
lr_model_8 = lr.fit(train_8)
#train_test_8 = train_test.select('features','Horror').withColumnRenamed("Horror","label") 
#predictions_8 = lr_model_8.transform(train_test_8)

In [79]:
#### Model -9 - Black-and-white

In [55]:
# Model 9 - Black-and-white: the 0/1 'Black-and-white' indicator becomes 'label'.
train_9 = train.select('features', train['Black-and-white'].alias('label'))
lr_model_9 = lr.fit(train_9)

In [81]:
#### Model - 10 -Indie

In [56]:
# Model 10 - Indie: the 0/1 'Indie' indicator becomes the binary 'label'.
train_10 = train.select('features', train['Indie'].alias('label'))
lr_model_10 = lr.fit(train_10)

In [83]:
#### Model - 11 -Action/Adventure

In [57]:
# Model 11 - Action/Adventure: the 0/1 'Action/Adventure' indicator becomes 'label'.
train_11 = train.select('features', train['Action/Adventure'].alias('label'))
lr_model_11 = lr.fit(train_11)

In [85]:
#### Model - 12 -Adventure

In [58]:
# Model 12 - Adventure: the 0/1 'Adventure' indicator becomes the binary 'label'.
train_12 = train.select('features', train['Adventure'].alias('label'))
lr_model_12 = lr.fit(train_12)

In [87]:
#### Model - 13 - Family Film

In [59]:
# Model 13 - Family Film: the 0/1 'Family Film' indicator becomes 'label'.
train_13 = train.select('features', train['Family Film'].alias('label'))
lr_model_13 = lr.fit(train_13)

In [89]:
#### Model - 14 -Short Film

In [60]:
# Model 14 - Short Film: the 0/1 'Short Film' indicator becomes 'label'.
train_14 = train.select('features', train['Short Film'].alias('label'))
lr_model_14 = lr.fit(train_14)

In [91]:
#### Model -15 - Romantic drama

In [61]:
# Model 15 - Romantic drama: the 0/1 'Romantic drama' indicator becomes 'label'.
train_15 = train.select('features', train['Romantic drama'].alias('label'))
lr_model_15 = lr.fit(train_15)

In [93]:
#### Model -16 - Animation

In [62]:
# Model 16 - Animation: the 0/1 'Animation' indicator becomes the binary 'label'.
train_16 = train.select('features', train['Animation'].alias('label'))
lr_model_16 = lr.fit(train_16)

In [95]:
#### Model -17 - Musical

In [63]:
# Model 17 - Musical: the 0/1 'Musical' indicator becomes the binary 'label'.
train_17 = train.select('features', train['Musical'].alias('label'))
lr_model_17 = lr.fit(train_17)

In [97]:
#### Model -18 -Science Fiction

In [64]:
# Model 18 - Science Fiction: the 0/1 'Science Fiction' indicator becomes 'label'.
train_18 = train.select('features', train['Science Fiction'].alias('label'))
lr_model_18 = lr.fit(train_18)

In [99]:
#### Model -19 -Mystery

In [66]:
# Model 19 - Mystery: the 0/1 'Mystery' indicator becomes the binary 'label'.
train_19 = train.select('features', train['Mystery'].alias('label'))
lr_model_19 = lr.fit(train_19)

In [67]:
#### Model -20 -Romantic comedy

In [68]:
# Model 20 - Romantic comedy: the 0/1 'Romantic comedy' indicator becomes 'label'.
train_20 = train.select('features', train['Romantic comedy'].alias('label'))
lr_model_20 = lr.fit(train_20)

# Test Data

In [69]:
# Load test.csv on the driver with pandas, mirroring how train.csv was read.
test_pandas_df = pd.read_csv("test.csv")

In [70]:
# Convert the pandas test frame into a Spark DataFrame and keep its RDD handle.
# NOTE(review): test_rdd_df is not referenced by any later cell visible here.
test_spark_df = sqlContext.createDataFrame(test_pandas_df)
test_rdd_df = test_spark_df.rdd

In [71]:
test_spark_df.show(3)

+--------+--------------------+--------------------+
|movie_id|          movie_name|                plot|
+--------+--------------------+--------------------+
| 1335380|              Exodus|The film is based...|
|29062594|A la salida nos v...|A group of teenag...|
| 9252321|   Come Back, Africa|This story of a Z...|
+--------+--------------------+--------------------+
only showing top 3 rows



In [72]:
# Tokenise the test plots with the same regex tokenizer used for training.
test_regex_tokenized = regex_tokenizer.transform(test_spark_df)

In [73]:
# Same stop-word remover as for training; numeric tokens are still present here.
test_stopword_tokenized_withnum = remover.transform(test_regex_tokenized)

In [74]:
test_stopword_tokenized_withnum.show()

+--------+--------------------+--------------------+--------------------+--------------------+
|movie_id|          movie_name|                plot|          plot_terms|            filtered|
+--------+--------------------+--------------------+--------------------+--------------------+
| 1335380|              Exodus|The film is based...|[the, film, is, b...|[film, based, eve...|
|29062594|A la salida nos v...|A group of teenag...|[a, group, of, te...|[group, teenagers...|
| 9252321|   Come Back, Africa|This story of a Z...|[this, story, of,...|[story, zulu, fam...|
|13455076|       A Merry Mixup|The Stooges play ...|[the, stooges, pl...|[stooges, play, t...|
|24165951|        Getting Even|A soldier-of-fort...|[a, soldier, of, ...|[soldier, fortune...|
| 1925869|  River of No Return|Set in the Northw...|[set, in, the, no...|[set, northwester...|
|10799612|          Amici miei|Like in many othe...|[like, in, many, ...|[like, many, moni...|
|28238240|Mickey's Big Game...|Mickey and the Sc..

In [75]:
def is_not_digit(rec):
    """Return the tokens of ``rec`` that do not consist solely of digits.

    ``rec`` is an iterable of string tokens. Order is preserved and a new
    list is returned; tokens that mix digits with other characters are kept.

    Note: this repeats the definition from the training section of the
    notebook and rebinds the same name.
    """
    return [token for token in rec if not token.isdigit()]
            
# Pass the first four columns through unchanged and strip digit-only tokens
# from the fifth ('filtered'), then rebuild a DataFrame with the same names.
test_stopword_without_numbers = test_stopword_tokenized_withnum.rdd.map(
    lambda row: [row[0], row[1], row[2], row[3], is_not_digit(row[4])]
)
test_stopword_tokenized = sqlContext.createDataFrame(
    test_stopword_without_numbers,
    ['movie_id', 'movie_name', 'plot', 'plot_terms', 'filtered'],
)
test_stopword_tokenized.show(5)

+--------+--------------------+--------------------+--------------------+--------------------+
|movie_id|          movie_name|                plot|          plot_terms|            filtered|
+--------+--------------------+--------------------+--------------------+--------------------+
| 1335380|              Exodus|The film is based...|[the, film, is, b...|[film, based, eve...|
|29062594|A la salida nos v...|A group of teenag...|[a, group, of, te...|[group, teenagers...|
| 9252321|   Come Back, Africa|This story of a Z...|[this, story, of,...|[story, zulu, fam...|
|13455076|       A Merry Mixup|The Stooges play ...|[the, stooges, pl...|[stooges, play, t...|
|24165951|        Getting Even|A soldier-of-fort...|[a, soldier, of, ...|[soldier, fortune...|
+--------+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [110]:
#test_cv = CountVectorizer(inputCol = 'filtered', vocabSize = 10000, minDF = 5.0, outputCol = 'raw_features')

In [111]:
#test_cv_model = test_cv.fit(test_stopword_tokenized)
#test_cv_df = test_cv_model.transform(test_stopword_tokenized)

In [112]:
#test_w2v = Word2Vec(vectorSize=300, minCount=5, inputCol="filtered", outputCol="features"

In [76]:
#test_doc2vec_model = w2v.fit(test_stopword_tokenized)
# Vectorise the test tokens with the Word2Vec model fitted on the training
# data (doc2vec_model) rather than refitting on the test set.
test_cv_df = doc2vec_model.transform(test_stopword_tokenized)

In [114]:
#test_idf = IDF(inputCol = 'raw_features', outputCol = 'features')
#test_idf_model = test_idf.fit(test_cv_df)
#test_final_features_all = idf_model.transform(test_cv_df)

In [79]:
# Keep only the Word2Vec vector, exposed under the name 'features'.
test_final_features = (
    test_cv_df
    .withColumnRenamed('rawfeatures', 'features')
    .select("features")
)

In [80]:
test_final_features.show()

+--------------------+
|            features|
+--------------------+
|[4.60732558957860...|
|[-0.0281006262745...|
|[-0.0033151158445...|
|[-0.0251178558541...|
|[0.02676017268095...|
|[-0.0044664675048...|
|[-0.0247545663940...|
|[-0.0110064580637...|
|[0.00185911922366...|
|[-0.0681458557373...|
|[-0.0167336908851...|
|[-0.0555674358471...|
|[2.16267181163893...|
|[-0.0185805701486...|
|[-0.0038303944157...|
|[-0.0099951453892...|
|[-0.0030056643691...|
|[-0.0152017962006...|
|[0.03550026840631...|
|[-0.0036234850048...|
+--------------------+
only showing top 20 rows



In [81]:

# Add a constant placeholder 'label' column (0 for every row); the test frame
# shown above carries no genre column of its own.
test_final_1 = test_final_features.withColumn('label',f.lit(0))
test_final_1.show()

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[4.60732558957860...|    0|
|[-0.0281006262745...|    0|
|[-0.0033151158445...|    0|
|[-0.0251178558541...|    0|
|[0.02676017268095...|    0|
|[-0.0044664675048...|    0|
|[-0.0247545663940...|    0|
|[-0.0110064580637...|    0|
|[0.00185911922366...|    0|
|[-0.0681458557373...|    0|
|[-0.0167336908851...|    0|
|[-0.0555674358471...|    0|
|[2.16267181163893...|    0|
|[-0.0185805701486...|    0|
|[-0.0038303944157...|    0|
|[-0.0099951453892...|    0|
|[-0.0030056643691...|    0|
|[-0.0152017962006...|    0|
|[0.03550026840631...|    0|
|[-0.0036234850048...|    0|
+--------------------+-----+
only showing top 20 rows



In [148]:
# Score the test set with model 1 (Drama).
test_predictions_1 = lr_model_1.transform(test_final_1)


In [83]:
# Keep the features and placeholder label, and store model 1's prediction as
# the integer column 'column_1'; the other model outputs are dropped.
test_predictions_1 = test_predictions_1.select(
    'features',
    test_predictions_1['prediction'].cast('int').alias('column_1'),
    'label',
)
test_predictions_1.show()

+--------------------+--------+-----+
|            features|column_1|label|
+--------------------+--------+-----+
|[4.60732558957860...|       1|    0|
|[-0.0281006262745...|       1|    0|
|[-0.0033151158445...|       1|    0|
|[-0.0251178558541...|       0|    0|
|[0.02676017268095...|       0|    0|
|[-0.0044664675048...|       1|    0|
|[-0.0247545663940...|       1|    0|
|[-0.0110064580637...|       0|    0|
|[0.00185911922366...|       0|    0|
|[-0.0681458557373...|       0|    0|
|[-0.0167336908851...|       1|    0|
|[-0.0555674358471...|       0|    0|
|[2.16267181163893...|       1|    0|
|[-0.0185805701486...|       0|    0|
|[-0.0038303944157...|       1|    0|
|[-0.0099951453892...|       1|    0|
|[-0.0030056643691...|       0|    0|
|[-0.0152017962006...|       1|    0|
|[0.03550026840631...|       1|    0|
|[-0.0036234850048...|       1|    0|
+--------------------+--------+-----+
only showing top 20 rows



In [84]:
test_final_2 = test_predictions_1

In [85]:
# Score the accumulated frame with model 2 (Comedy).
test_predictions_2 = lr_model_2.transform(test_final_2)

In [86]:
# Append model 2's prediction as integer column 'column_2' and drop the other
# model outputs so the next model can be applied to the same frame.
test_predictions_2 = test_predictions_2.select(
    'features',
    'column_1',
    test_predictions_2['prediction'].cast('int').alias('column_2'),
    'label',
)
test_predictions_2.show()

+--------------------+--------+--------+-----+
|            features|column_1|column_2|label|
+--------------------+--------+--------+-----+
|[4.60732558957860...|       1|       0|    0|
|[-0.0281006262745...|       1|       1|    0|
|[-0.0033151158445...|       1|       0|    0|
|[-0.0251178558541...|       0|       1|    0|
|[0.02676017268095...|       0|       0|    0|
|[-0.0044664675048...|       1|       0|    0|
|[-0.0247545663940...|       1|       0|    0|
|[-0.0110064580637...|       0|       1|    0|
|[0.00185911922366...|       0|       0|    0|
|[-0.0681458557373...|       0|       0|    0|
|[-0.0167336908851...|       1|       0|    0|
|[-0.0555674358471...|       0|       0|    0|
|[2.16267181163893...|       1|       0|    0|
|[-0.0185805701486...|       0|       0|    0|
|[-0.0038303944157...|       1|       0|    0|
|[-0.0099951453892...|       1|       0|    0|
|[-0.0030056643691...|       0|       0|    0|
|[-0.0152017962006...|       1|       0|    0|
|[0.035500268

In [87]:
test_final_3 = test_predictions_2

In [88]:
# Score the accumulated frame with model 3 (Romance Film).
test_predictions_3 = lr_model_3.transform(test_final_3)

In [89]:
# Append model 3's prediction as integer column 'column_3' and drop the other
# model outputs so the next model can be applied to the same frame.
test_predictions_3 = test_predictions_3.select(
    'features',
    'column_1',
    'column_2',
    test_predictions_3['prediction'].cast('int').alias('column_3'),
    'label',
)
test_predictions_3.show()

+--------------------+--------+--------+--------+-----+
|            features|column_1|column_2|column_3|label|
+--------------------+--------+--------+--------+-----+
|[4.60732558957860...|       1|       0|       0|    0|
|[-0.0281006262745...|       1|       1|       0|    0|
|[-0.0033151158445...|       1|       0|       0|    0|
|[-0.0251178558541...|       0|       1|       0|    0|
|[0.02676017268095...|       0|       0|       0|    0|
|[-0.0044664675048...|       1|       0|       0|    0|
|[-0.0247545663940...|       1|       0|       0|    0|
|[-0.0110064580637...|       0|       1|       0|    0|
|[0.00185911922366...|       0|       0|       0|    0|
|[-0.0681458557373...|       0|       0|       0|    0|
|[-0.0167336908851...|       1|       0|       0|    0|
|[-0.0555674358471...|       0|       0|       0|    0|
|[2.16267181163893...|       1|       0|       0|    0|
|[-0.0185805701486...|       0|       0|       0|    0|
|[-0.0038303944157...|       1|       0|       0

In [90]:
# Carry the accumulated predictions forward and score them with model 4 (Thriller).
test_final_4 = test_predictions_3
test_predictions_4 = lr_model_4.transform(test_final_4)

In [91]:
# Append model 4's prediction as integer column 'column_4' and drop the other
# model outputs so the next model can be applied to the same frame.
test_predictions_4 = test_predictions_4.select(
    'features',
    'column_1',
    'column_2',
    'column_3',
    test_predictions_4['prediction'].cast('int').alias('column_4'),
    'label',
)
#test_predictions_4.show()

In [92]:
# Run lr_model_5 on the frame from the previous step (features, column_1..column_4, label);
# the following cell reads the 'prediction' column this adds.
test_final_5 = test_predictions_4
test_predictions_5 = lr_model_5.transform(test_final_5)

In [93]:
# column_1..column_4 come from the earlier models; generate their names instead of typing them out.
_prior_cols = list(map('column_{}'.format, range(1, 5)))
# Keep features, the earlier columns, this model's prediction and the label.
_scored = test_predictions_5.select(['features'] + _prior_cols + ['prediction', 'label'])
_scored = _scored.withColumnRenamed('prediction', 'column')
# Store the prediction as the integer column_5 and leave the temporary 'column' behind.
_scored = _scored.withColumn('column_5', _scored['column'].cast('int'))
test_predictions_5 = _scored.select(['features'] + _prior_cols + ['column_5', 'label'])

In [94]:
# Run lr_model_6 on the frame from the previous step (features, column_1..column_5, label);
# the following cell reads the 'prediction' column this adds.
test_final_6 = test_predictions_5
test_predictions_6 = lr_model_6.transform(test_final_6)

In [95]:
# column_1..column_5 come from the earlier models; generate their names instead of typing them out.
_prior_cols = list(map('column_{}'.format, range(1, 6)))
# Keep features, the earlier columns, this model's prediction and the label.
_scored = test_predictions_6.select(['features'] + _prior_cols + ['prediction', 'label'])
_scored = _scored.withColumnRenamed('prediction', 'column')
# Store the prediction as the integer column_6 and leave the temporary 'column' behind.
_scored = _scored.withColumn('column_6', _scored['column'].cast('int'))
test_predictions_6 = _scored.select(['features'] + _prior_cols + ['column_6', 'label'])

In [96]:
# Run lr_model_7 on the frame from the previous step (features, column_1..column_6, label);
# the following cell reads the 'prediction' column this adds.
test_final_7 = test_predictions_6
test_predictions_7 = lr_model_7.transform(test_final_7)

In [97]:
# column_1..column_6 come from the earlier models; generate their names instead of typing them out.
_prior_cols = list(map('column_{}'.format, range(1, 7)))
# Keep features, the earlier columns, this model's prediction and the label.
_scored = test_predictions_7.select(['features'] + _prior_cols + ['prediction', 'label'])
_scored = _scored.withColumnRenamed('prediction', 'column')
# Store the prediction as the integer column_7 and leave the temporary 'column' behind.
_scored = _scored.withColumn('column_7', _scored['column'].cast('int'))
test_predictions_7 = _scored.select(['features'] + _prior_cols + ['column_7', 'label'])

In [98]:
# Run lr_model_8 on the frame from the previous step (features, column_1..column_7, label);
# the following cell reads the 'prediction' column this adds.
test_final_8 = test_predictions_7
test_predictions_8 = lr_model_8.transform(test_final_8)

In [99]:
# column_1..column_7 come from the earlier models; generate their names instead of typing them out.
_prior_cols = list(map('column_{}'.format, range(1, 8)))
# Keep features, the earlier columns, this model's prediction and the label.
_scored = test_predictions_8.select(['features'] + _prior_cols + ['prediction', 'label'])
_scored = _scored.withColumnRenamed('prediction', 'column')
# Store the prediction as the integer column_8 and leave the temporary 'column' behind.
_scored = _scored.withColumn('column_8', _scored['column'].cast('int'))
test_predictions_8 = _scored.select(['features'] + _prior_cols + ['column_8', 'label'])

In [100]:
# Run lr_model_9 on the frame from the previous step (features, column_1..column_8, label);
# the following cell reads the 'prediction' column this adds.
test_final_9 = test_predictions_8
test_predictions_9 = lr_model_9.transform(test_final_9)

In [101]:
# column_1..column_8 come from the earlier models; generate their names instead of typing them out.
_prior_cols = list(map('column_{}'.format, range(1, 9)))
# Keep features, the earlier columns, this model's prediction and the label.
_scored = test_predictions_9.select(['features'] + _prior_cols + ['prediction', 'label'])
_scored = _scored.withColumnRenamed('prediction', 'column')
# Store the prediction as the integer column_9 and leave the temporary 'column' behind.
_scored = _scored.withColumn('column_9', _scored['column'].cast('int'))
test_predictions_9 = _scored.select(['features'] + _prior_cols + ['column_9', 'label'])

In [102]:
# Run lr_model_10 on the frame from the previous step (features, column_1..column_9, label);
# the following cell reads the 'prediction' column this adds.
test_final_10 = test_predictions_9
test_predictions_10 = lr_model_10.transform(test_final_10)

In [103]:
# column_1..column_9 come from the earlier models; generate their names instead of typing them out.
_prior_cols = list(map('column_{}'.format, range(1, 10)))
# Keep features, the earlier columns, this model's prediction and the label.
_scored = test_predictions_10.select(['features'] + _prior_cols + ['prediction', 'label'])
_scored = _scored.withColumnRenamed('prediction', 'column')
# Store the prediction as the integer column_10 and leave the temporary 'column' behind.
_scored = _scored.withColumn('column_10', _scored['column'].cast('int'))
test_predictions_10 = _scored.select(['features'] + _prior_cols + ['column_10', 'label'])

In [104]:
# Run lr_model_11 on the frame from the previous step (features, column_1..column_10, label);
# the following cell reads the 'prediction' column this adds.
test_final_11 = test_predictions_10
test_predictions_11 = lr_model_11.transform(test_final_11)

In [105]:
# column_1..column_10 come from the earlier models; generate their names instead of typing them out.
_prior_cols = list(map('column_{}'.format, range(1, 11)))
# Keep features, the earlier columns, this model's prediction and the label.
_scored = test_predictions_11.select(['features'] + _prior_cols + ['prediction', 'label'])
_scored = _scored.withColumnRenamed('prediction', 'column')
# Store the prediction as the integer column_11 and leave the temporary 'column' behind.
_scored = _scored.withColumn('column_11', _scored['column'].cast('int'))
test_predictions_11 = _scored.select(['features'] + _prior_cols + ['column_11', 'label'])

In [106]:
# Run lr_model_12 on the frame from the previous step (features, column_1..column_11, label);
# the following cell reads the 'prediction' column this adds.
test_final_12 = test_predictions_11
test_predictions_12 = lr_model_12.transform(test_final_12)

In [107]:
# column_1..column_11 come from the earlier models; generate their names instead of typing them out.
_prior_cols = list(map('column_{}'.format, range(1, 12)))
# Keep features, the earlier columns, this model's prediction and the label.
_scored = test_predictions_12.select(['features'] + _prior_cols + ['prediction', 'label'])
_scored = _scored.withColumnRenamed('prediction', 'column')
# Store the prediction as the integer column_12 and leave the temporary 'column' behind.
_scored = _scored.withColumn('column_12', _scored['column'].cast('int'))
test_predictions_12 = _scored.select(['features'] + _prior_cols + ['column_12', 'label'])

In [108]:
# Run lr_model_13 on the frame from the previous step (features, column_1..column_12, label);
# the following cell reads the 'prediction' column this adds.
test_final_13 = test_predictions_12
test_predictions_13 = lr_model_13.transform(test_final_13)

In [109]:
# column_1..column_12 come from the earlier models; generate their names instead of typing them out.
_prior_cols = list(map('column_{}'.format, range(1, 13)))
# Keep features, the earlier columns, this model's prediction and the label.
_scored = test_predictions_13.select(['features'] + _prior_cols + ['prediction', 'label'])
_scored = _scored.withColumnRenamed('prediction', 'column')
# Store the prediction as the integer column_13 and leave the temporary 'column' behind.
_scored = _scored.withColumn('column_13', _scored['column'].cast('int'))
test_predictions_13 = _scored.select(['features'] + _prior_cols + ['column_13', 'label'])

In [110]:
# Run lr_model_14 on the frame from the previous step (features, column_1..column_13, label);
# the following cell reads the 'prediction' column this adds.
test_final_14 = test_predictions_13
test_predictions_14 = lr_model_14.transform(test_final_14)

In [111]:
# column_1..column_13 come from the earlier models; generate their names instead of typing them out.
_prior_cols = list(map('column_{}'.format, range(1, 14)))
# Keep features, the earlier columns, this model's prediction and the label.
_scored = test_predictions_14.select(['features'] + _prior_cols + ['prediction', 'label'])
_scored = _scored.withColumnRenamed('prediction', 'column')
# Store the prediction as the integer column_14 and leave the temporary 'column' behind.
_scored = _scored.withColumn('column_14', _scored['column'].cast('int'))
test_predictions_14 = _scored.select(['features'] + _prior_cols + ['column_14', 'label'])

In [112]:
# Run lr_model_15 on the frame from the previous step (features, column_1..column_14, label);
# the following cell reads the 'prediction' column this adds.
test_final_15 = test_predictions_14
test_predictions_15 = lr_model_15.transform(test_final_15)

In [113]:
# column_1..column_14 come from the earlier models; generate their names instead of typing them out.
_prior_cols = list(map('column_{}'.format, range(1, 15)))
# Keep features, the earlier columns, this model's prediction and the label.
_scored = test_predictions_15.select(['features'] + _prior_cols + ['prediction', 'label'])
_scored = _scored.withColumnRenamed('prediction', 'column')
# Store the prediction as the integer column_15 and leave the temporary 'column' behind.
_scored = _scored.withColumn('column_15', _scored['column'].cast('int'))
test_predictions_15 = _scored.select(['features'] + _prior_cols + ['column_15', 'label'])

In [114]:
# Run lr_model_16 on the frame from the previous step (features, column_1..column_15, label);
# the following cell reads the 'prediction' column this adds.
test_final_16 = test_predictions_15
test_predictions_16 = lr_model_16.transform(test_final_16)

In [115]:
# column_1..column_15 come from the earlier models; generate their names instead of typing them out.
_prior_cols = list(map('column_{}'.format, range(1, 16)))
# Keep features, the earlier columns, this model's prediction and the label.
_scored = test_predictions_16.select(['features'] + _prior_cols + ['prediction', 'label'])
_scored = _scored.withColumnRenamed('prediction', 'column')
# Store the prediction as the integer column_16 and leave the temporary 'column' behind.
_scored = _scored.withColumn('column_16', _scored['column'].cast('int'))
test_predictions_16 = _scored.select(['features'] + _prior_cols + ['column_16', 'label'])

In [116]:
# Run lr_model_17 on the frame from the previous step (features, column_1..column_16, label);
# the following cell reads the 'prediction' column this adds.
test_final_17 = test_predictions_16
test_predictions_17 = lr_model_17.transform(test_final_17)

In [117]:
# column_1..column_16 come from the earlier models; generate their names instead of typing them out.
_prior_cols = list(map('column_{}'.format, range(1, 17)))
# Keep features, the earlier columns, this model's prediction and the label.
_scored = test_predictions_17.select(['features'] + _prior_cols + ['prediction', 'label'])
_scored = _scored.withColumnRenamed('prediction', 'column')
# Store the prediction as the integer column_17 and leave the temporary 'column' behind.
_scored = _scored.withColumn('column_17', _scored['column'].cast('int'))
test_predictions_17 = _scored.select(['features'] + _prior_cols + ['column_17', 'label'])

In [118]:
# Run lr_model_18 on the frame from the previous step (features, column_1..column_17, label);
# the following cell reads the 'prediction' column this adds.
test_final_18 = test_predictions_17
test_predictions_18 = lr_model_18.transform(test_final_18)

In [119]:
# column_1..column_17 come from the earlier models; generate their names instead of typing them out.
_prior_cols = list(map('column_{}'.format, range(1, 18)))
# Keep features, the earlier columns, this model's prediction and the label.
_scored = test_predictions_18.select(['features'] + _prior_cols + ['prediction', 'label'])
_scored = _scored.withColumnRenamed('prediction', 'column')
# Store the prediction as the integer column_18 and leave the temporary 'column' behind.
_scored = _scored.withColumn('column_18', _scored['column'].cast('int'))
test_predictions_18 = _scored.select(['features'] + _prior_cols + ['column_18', 'label'])

In [120]:
# Run lr_model_19 on the frame from the previous step (features, column_1..column_18, label);
# the following cell reads the 'prediction' column this adds.
test_final_19 = test_predictions_18
test_predictions_19 = lr_model_19.transform(test_final_19)

In [121]:
# column_1..column_18 come from the earlier models; generate their names instead of typing them out.
_prior_cols = list(map('column_{}'.format, range(1, 19)))
# Keep features, the earlier columns, this model's prediction and the label.
_scored = test_predictions_19.select(['features'] + _prior_cols + ['prediction', 'label'])
_scored = _scored.withColumnRenamed('prediction', 'column')
# Store the prediction as the integer column_19 and leave the temporary 'column' behind.
_scored = _scored.withColumn('column_19', _scored['column'].cast('int'))
test_predictions_19 = _scored.select(['features'] + _prior_cols + ['column_19', 'label'])

In [122]:
# Run lr_model_20 on the frame from the previous step (features, column_1..column_19, label);
# the following cell reads the 'prediction' column this adds.
test_final_20 = test_predictions_19
test_predictions_20 = lr_model_20.transform(test_final_20)

In [123]:
# column_1..column_19 come from the earlier models; generate their names instead of typing them out.
_prior_cols = list(map('column_{}'.format, range(1, 20)))
# Keep features, the earlier columns, this model's prediction and the label.
_scored = test_predictions_20.select(['features'] + _prior_cols + ['prediction', 'label'])
_scored = _scored.withColumnRenamed('prediction', 'column')
# Store the prediction as the integer column_20 and leave the temporary 'column' behind.
_scored = _scored.withColumn('column_20', _scored['column'].cast('int'))
test_predictions_20 = _scored.select(['features'] + _prior_cols + ['column_20', 'label'])

In [124]:
# Preview the frame after all 20 models: features, column_1..column_20, label.
test_predictions_20.show()

+--------------------+--------+--------+--------+--------+--------+--------+--------+--------+--------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+-----+
|            features|column_1|column_2|column_3|column_4|column_5|column_6|column_7|column_8|column_9|column_10|column_11|column_12|column_13|column_14|column_15|column_16|column_17|column_18|column_19|column_20|label|
+--------------------+--------+--------+--------+--------+--------+--------+--------+--------+--------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+-----+
|[4.60732558957860...|       1|       0|       0|       0|       0|       0|       0|       0|       0|        0|        0|        0|        0|        0|        0|        0|        0|        0|        0|        0|    0|
|[-0.0281006262745...|       1|       1|       0|       0|       0|       0|       0|       0|       0|        1|       

In [135]:
# Build the concat arguments: column_1 .. column_20 with a single-space literal
# between each consecutive pair (no leading or trailing separator).
_concat_args = []
for _idx in range(1, 21):
    if _idx > 1:
        _concat_args.append(f.lit(' '))
    _concat_args.append(f.col('column_{}'.format(_idx)))
# Attach the joined string as 'predictions' and keep it next to the feature vector.
test_predictions = (
    test_predictions_20
    .withColumn('predictions', f.concat(*_concat_args))
    .select('features', 'predictions')
)

In [136]:
# Preview the space-joined prediction strings next to their feature vectors.
test_predictions.show()

+--------------------+--------------------+
|            features|         predictions|
+--------------------+--------------------+
|[4.60732558957860...|1 0 0 0 0 0 0 0 0...|
|[-0.0281006262745...|1 1 0 0 0 0 0 0 0...|
|[-0.0033151158445...|1 0 0 0 0 0 0 0 0...|
|[-0.0251178558541...|0 1 0 0 0 0 0 0 0...|
|[0.02676017268095...|0 0 0 0 0 0 0 0 0...|
|[-0.0044664675048...|1 0 0 0 0 0 0 0 0...|
|[-0.0247545663940...|1 0 0 0 0 0 0 0 0...|
|[-0.0110064580637...|0 1 0 0 0 0 0 0 0...|
|[0.00185911922366...|0 0 0 0 1 0 0 0 0...|
|[-0.0681458557373...|0 0 0 0 0 0 0 0 1...|
|[-0.0167336908851...|1 0 0 0 0 0 0 0 0...|
|[-0.0555674358471...|0 0 0 0 0 0 0 1 0...|
|[2.16267181163893...|1 0 0 0 0 0 0 0 0...|
|[-0.0185805701486...|0 0 0 0 0 0 0 0 0...|
|[-0.0038303944157...|1 0 0 0 0 0 0 0 0...|
|[-0.0099951453892...|1 0 0 0 0 0 0 0 0...|
|[-0.0030056643691...|0 0 0 0 0 0 0 0 0...|
|[-0.0152017962006...|1 0 1 0 0 0 0 0 0...|
|[0.03550026840631...|1 0 1 0 0 1 0 0 0...|
|[-0.0036234850048...|1 0 0 0 0 

In [137]:
# Rename the vector column to 'rawfeatures' so it carries the same name as the
# vector column in test_cv_df, which the later join uses as its key.
# (A previous, now-disabled variant joined aliased frames directly on `features`.)
_pred_cols = ['features', 'predictions']
test_predictions = (
    test_predictions
    .select(_pred_cols)
    .withColumnRenamed('features', 'rawfeatures')
)

In [138]:
# Preview test_cv_df, the frame that still carries movie_id alongside rawfeatures.
test_cv_df.show()

+--------+--------------------+--------------------+--------------------+--------------------+--------------------+
|movie_id|          movie_name|                plot|          plot_terms|            filtered|         rawfeatures|
+--------+--------------------+--------------------+--------------------+--------------------+--------------------+
| 1335380|              Exodus|The film is based...|[the, film, is, b...|[film, based, eve...|[4.60732558957860...|
|29062594|A la salida nos v...|A group of teenag...|[a, group, of, te...|[group, teenagers...|[-0.0281006262745...|
| 9252321|   Come Back, Africa|This story of a Z...|[this, story, of,...|[story, zulu, fam...|[-0.0033151158445...|
|13455076|       A Merry Mixup|The Stooges play ...|[the, stooges, pl...|[stooges, play, t...|[-0.0251178558541...|
|24165951|        Getting Even|A soldier-of-fort...|[a, soldier, of, ...|[soldier, fortune...|[0.02676017268095...|
| 1925869|  River of No Return|Set in the Northw...|[set, in, the, no...

In [139]:
# Re-attach movie_id and the other test_cv_df columns to the predictions by
# matching rows on the shared 'rawfeatures' vector column (inner join).
# NOTE(review): the key is a feature vector, not an id — rows whose vectors are
# identical would match each other and multiply; confirm vectors are unique per
# movie, or carry movie_id through the prediction steps instead.
output_intermediate = test_cv_df.join(test_predictions, on = ['rawfeatures'],how = 'inner')

In [140]:
# Preview the joined frame (movie metadata plus the predictions string).
output_intermediate.show()

+--------------------+--------+--------------------+--------------------+--------------------+--------------------+--------------------+
|         rawfeatures|movie_id|          movie_name|                plot|          plot_terms|            filtered|         predictions|
+--------------------+--------+--------------------+--------------------+--------------------+--------------------+--------------------+
|[-0.0325246172348...|18333832|   The Last Mistress|Ryno de Marigny ,...|[ryno, de, marign...|[ryno, de, marign...|1 0 1 0 0 0 0 0 0...|
|[-0.0325076468661...|25714947|           Bolo Raam|Bolo Raam follows...|[bolo, raam, foll...|[bolo, raam, foll...|1 0 0 1 0 0 0 0 0...|
|[-0.0264306829225...| 3676302|          Tokyo Fist|The film tells ab...|[the, film, tells...|[film, tells, jap...|1 0 0 0 0 0 0 0 0...|
|[-0.0255226221503...| 5217643|            Mugavari|Sridhar  has been...|[sridhar, has, be...|[sridhar, trying,...|1 0 0 0 0 0 0 0 0...|
|[-0.0227838564441...|25305145|         L

In [142]:
# Drop fully identical rows (presumably duplicates created by the join on
# rawfeatures — confirm), then display the remaining row count.
output_intermediate = output_intermediate.distinct()
output_intermediate.count()

7777

In [143]:
# Keep only the two columns needed for the task output.
output_task_3 = output_intermediate.select('movie_id','predictions')

In [144]:
# Collect the (movie_id, predictions) pairs to the driver and write them to task_3.csv.
# index=False keeps pandas' auto-generated row index out of the file, so the CSV
# contains exactly the two selected columns instead of an extra unnamed first column.
output_task_3.toPandas().to_csv("task_3.csv", index=False)