# Library


In [0]:
import pyspark
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession

In [0]:
#sc=SparkContext() #Uncomment this line if you're creating a SparkContext() for the first time
sql=SQLContext(sc)
ss=SparkSession(sc)



# Rename

In [0]:
dbutils.fs.cp('/FileStore/tables/small_csv.gz', '/FileStore/tables/small.csv.gz')
dbutils.fs.cp('/FileStore/tables/medium_csv.gz', '/FileStore/tables/medium.csv.gz')
dbutils.fs.cp('/FileStore/tables/large_csv.gz', '/FileStore/tables/large.csv.gz')

Out[3]: True

# Question 1

In [0]:
small=sql.read.csv('/FileStore/tables/small.csv.gz', header=True)
medium=sql.read.csv('/FileStore/tables/medium.csv.gz', header=True)
large=sql.read.csv('/FileStore/tables/large.csv.gz', header=True)

In [0]:
small.registerTempTable('small')
medium.registerTempTable('medium')
large.registerTempTable('large')



## Answers

### Small

In [0]:
sql.sql('select count(distinct sentence) small from small').show()

+-----+
|small|
+-----+
|24803|
+-----+



### Medium

In [0]:
sql.sql('select count(distinct sentence) medium from medium').show()

+------+
|medium|
+------+
| 98504|
+------+



### Large

In [0]:
sql.sql('select count(distinct sentence) large from large').show()

+------+
| large|
+------+
|389639|
+------+



# Question 2

In [0]:
small_sentence=sql.sql('select sentence from small')
medium_sentence=sql.sql('select sentence from medium')
large_sentence=sql.sql('select sentence from large')

In [0]:
#Delete the "mode='overwrite'" if you're writing for the first time
small_sentence.write.csv('/FileStore/tables/small_sentence', mode='overwrite')
medium_sentence.write.csv('/FileStore/tables/medium_sentence', mode='overwrite')
large_sentence.write.csv('/FileStore/tables/large_sentence', mode='overwrite')

In [0]:
small_sentence=sc.textFile('/FileStore/tables/small_sentence')
medium_sentence=sc.textFile('/FileStore/tables/medium_sentence')
large_sentence=sc.textFile('/FileStore/tables/large_sentence')

## Answers

### Small

In [0]:
small_sentence.map(lambda x: (len(x.split(' ')), x)).sortBy(lambda x: x[0], ascending=False).map(lambda x: x[0]).take(10)

Out[12]: [562, 160, 150, 114, 109, 102, 100, 95, 93, 91]

### Medium

In [0]:
medium_sentence.map(lambda x: (len(x.split(' ')), x)).sortBy(lambda x: x[0], ascending=False).map(lambda x: x[0]).take(10)

Out[13]: [2499, 381, 278, 243, 221, 206, 192, 190, 186, 186]

### Large

In [0]:
large_sentence.map(lambda x: (len(x.split(' ')), x)).sortBy(lambda x: x[0], ascending=False).map(lambda x: x[0]).take(10)

Out[14]: [4571, 2499, 562, 528, 426, 413, 382, 381, 348, 335]

# Question 3

In [0]:
average_number_bigrams_small_sentence=small_sentence.map(lambda x: len(x.split(' '))-1).map(lambda x: [x, 1]).reduce(lambda x, y: [x[0]+y[0], x[1]+y[1]])
average_number_bigrams_medium_sentence=medium_sentence.map(lambda x: len(x.split(' '))-1).map(lambda x: [x, 1]).reduce(lambda x, y: [x[0]+y[0], x[1]+y[1]])
average_number_bigrams_large_sentence=large_sentence.map(lambda x: len(x.split(' '))-1).map(lambda x: [x, 1]).reduce(lambda x, y: [x[0]+y[0], x[1]+y[1]])

## Answers

### Small

In [0]:
average_number_bigrams_small_sentence[0]/average_number_bigrams_small_sentence[1]

Out[16]: 18.09192

### Medium

In [0]:
average_number_bigrams_medium_sentence[0]/average_number_bigrams_medium_sentence[1]

Out[17]: 18.05113

### Large

In [0]:
average_number_bigrams_large_sentence[0]/average_number_bigrams_large_sentence[1]

Out[18]: 18.036295

# Question 4

In [0]:
small_sentence=small_sentence.map(lambda x: x.split(' '))
medium_sentence=medium_sentence.map(lambda x: x.split(' '))
large_sentence=large_sentence.map(lambda x: x.split(' '))

In [0]:
def concatenate_word(sentence):
    my_list=[]
    for i in range(0, len(sentence)-1):
        j=i+1
        my_list.append(sentence[i]+' '+sentence[j])
    return my_list

In [0]:
small_sentence=small_sentence.flatMap(concatenate_word)
medium_sentence=medium_sentence.flatMap(concatenate_word)
large_sentence=large_sentence.flatMap(concatenate_word)

## Answers

### Small

In [0]:
small_sentence.map(lambda x: (x, 1)).reduceByKey(lambda x, y: x+y).sortBy(lambda x: x[1], ascending=False).take(10)

Out[22]: [('of the', 4801),
 ('in the', 3409),
 ('to the', 1549),
 ('at the', 1342),
 ('is a', 1211),
 ('for the', 1124),
 ('and the', 954),
 ('on the', 952),
 ('as a', 911),
 ('with the', 774)]

### Medium

In [0]:
medium_sentence.map(lambda x: (x, 1)).reduceByKey(lambda x, y: x+y).sortBy(lambda x: x[1], ascending=False).take(10)

Out[23]: [('of the', 19033),
 ('in the', 13524),
 ('to the', 6565),
 ('at the', 5327),
 ('is a', 5024),
 ('for the', 4510),
 ('and the', 4083),
 ('on the', 3973),
 ('as a', 3290),
 ('by the', 2982)]

### Large

In [0]:
large_sentence.map(lambda x: (x, 1)).reduceByKey(lambda x, y: x+y).sortBy(lambda x: x[1], ascending=False).take(10)

Out[24]: [('of the', 76290),
 ('in the', 54056),
 ('to the', 25484),
 ('at the', 21588),
 ('is a', 19261),
 ('for the', 17942),
 ('on the', 16049),
 ('and the', 15822),
 ('as a', 13240),
 ('with the', 11928)]

# Question 5

In [0]:
magpie=ss.read.json('/FileStore/tables/MAGPIE_unfiltered.jsonl')

In [0]:
small_sentence.map(lambda x: [x]).toDF(['bigrams']).registerTempTable('small_bigrams')
medium_sentence.map(lambda x: [x]).toDF(['bigrams']).registerTempTable('medium_bigrams')
large_sentence.map(lambda x: [x]).toDF(['bigrams']).registerTempTable('large_bigrams')
magpie.registerTempTable('magpie')



## Answers


### Small

In [0]:
sql.sql('select count(distinct bigrams) small from small_bigrams where bigrams in (select idiom from magpie)').show()

+-----+
|small|
+-----+
|   11|
+-----+



### Medium

In [0]:
sql.sql('select count(distinct bigrams) medium from medium_bigrams where bigrams in (select idiom from magpie)').show()

+------+
|medium|
+------+
|    27|
+------+



### Large

In [0]:
sql.sql('select count(distinct bigrams) large from large_bigrams where bigrams in (select idiom from magpie)').show()

+-----+
|large|
+-----+
|   67|
+-----+



# Question 6


In [0]:
small_sentence.map(lambda x: (x, 1)).reduceByKey(lambda x, y: x+y).toDF(['bigrams', 'frequency']).registerTempTable('small_bigrams_frequency')
medium_sentence.map(lambda x: (x, 1)).reduceByKey(lambda x, y: x+y).toDF(['bigrams', 'frequency']).registerTempTable('medium_bigrams_frequency')
large_sentence.map(lambda x: (x, 1)).reduceByKey(lambda x, y: x+y).toDF(['bigrams', 'frequency']).registerTempTable('large_bigrams_frequency')

## Answers


### Small

In [0]:
sql.sql('select * from (select *, dense_rank() over(order by frequency desc, bigrams asc) rank from small_bigrams_frequency where bigrams not in (select idiom from magpie)) where rank between 2500 and 2510').show()

+------------+---------+----+
|     bigrams|frequency|rank|
+------------+---------+----+
|      to him|       12|2500|
| to identify|       12|2501|
| to increase|       12|2502|
|  to operate|       12|2503|
|townland was|       12|2504|
|  two events|       12|2505|
|      up and|       12|2506|
|       up by|       12|2507|
|      was as|       12|2508|
| was carried|       12|2509|
|was composed|       12|2510|
+------------+---------+----+



### Medium

In [0]:
sql.sql('select * from (select *, dense_rank() over(order by frequency desc, bigrams asc) rank from medium_bigrams_frequency where bigrams not in (select idiom from magpie)) where rank between 2500 and 2510').show()

+-----------------+---------+----+
|          bigrams|frequency|rank|
+-----------------+---------+----+
|        without a|       45|2500|
|       writer and|       45|2501|
|    "Although the|       44|2502|
|     "During this|       44|2503|
|- \"\"Griffithsia|       44|2504|
|         10 March|       44|2505|
|         1994) is|       44|2506|
|        2018) was|       44|2507|
|  Deputy Minister|       44|2508|
|       Europe and|       44|2509|
|        Museum in|       44|2510|
+-----------------+---------+----+



### Large

In [0]:
sql.sql('select * from (select *, dense_rank() over(order by frequency desc, bigrams asc) rank from large_bigrams_frequency where bigrams not in (select idiom from magpie)) where rank between 2500 and 2510').show()

+----------------+---------+----+
|         bigrams|frequency|rank|
+----------------+---------+----+
|         2018 to|      174|2500|
| Communist Party|      174|2501|
|Cup competition.|      174|2502|
|      During the|      174|2503|
|       a meeting|      174|2504|
|         a state|      174|2505|
|      a two-year|      174|2506|
|           at St|      174|2507|
|  been described|      174|2508|
|       bishop of|      174|2509|
|        chose to|      174|2510|
+----------------+---------+----+



In [0]:
sql.sql('select * from (select *, dense_rank() over(order by frequency desc, bigrams asc) rank from small_bigrams_frequency where bigrams not in (select idiom from magpie)) where bigrams="the civil"').show()

+---------+---------+----+
|  bigrams|frequency|rank|
+---------+---------+----+
|the civil|       12|2470|
+---------+---------+----+



In [0]:
sql.sql('select * from (select *, dense_rank() over(order by frequency desc, bigrams asc) rank from medium_bigrams_frequency where bigrams not in (select idiom from magpie)) where bigrams="males and"').show()

+---------+---------+----+
|  bigrams|frequency|rank|
+---------+---------+----+
|males and|       45|2474|
+---------+---------+----+

