In [2]:
from pyspark.sql import SparkSession

In [5]:
spark = SparkSession.builder.master("local[*]").appName("MyApp").getOrCreate()

In [6]:
sc = spark.sparkContext

In [9]:
df = spark.read.parquet("/content/dataPARQUET.parquet")

In [10]:
df.printSchema()

root
 |-- video_id: string (nullable = true)
 |-- trending_date: string (nullable = true)
 |-- title: string (nullable = true)
 |-- channel_title: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- publish_time: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- views: string (nullable = true)
 |-- likes: string (nullable = true)
 |-- dislikes: string (nullable = true)
 |-- comment_count: string (nullable = true)
 |-- thumbnail_link: string (nullable = true)
 |-- comments_disabled: string (nullable = true)
 |-- ratings_disabled: string (nullable = true)
 |-- video_error_or_removed: string (nullable = true)
 |-- description: string (nullable = true)



In [14]:
# Alternativa, importado la funcion col
from pyspark.sql.functions import col
df.select(col('title')).show()

+--------------------+
|               title|
+--------------------+
|WE WANT TO TALK A...|
|The Trump Preside...|
|Racist Superman |...|
|Nickelback Lyrics...|
|I Dare You: GOING...|
|2 Weeks with iPho...|
|Roy Moore & Jeff ...|
|5 Ice Cream Gadge...|
|The Greatest Show...|
|Why the rise of t...|
|Dion Lewis' 103-Y...|
|(SPOILERS) 'Shiva...|
|Marshmello - Bloc...|
|Which Countries A...|
|SHOPPING FOR NEW ...|
|    The New SpotMini|
|One Change That W...|
|How does your bod...|
|HomeMade Electric...|
|Founding An Inbre...|
+--------------------+
only showing top 20 rows



<H1>TRANSFORMACIONES

## Select y SelectExpr

In [17]:
df.select(col('title'), col('channel_title')).show()

+--------------------+--------------------+
|               title|       channel_title|
+--------------------+--------------------+
|WE WANT TO TALK A...|        CaseyNeistat|
|The Trump Preside...|     LastWeekTonight|
|Racist Superman |...|        Rudy Mancuso|
|Nickelback Lyrics...|Good Mythical Mor...|
|I Dare You: GOING...|            nigahiga|
|2 Weeks with iPho...|            iJustine|
|Roy Moore & Jeff ...| Saturday Night Live|
|5 Ice Cream Gadge...|  CrazyRussianHacker|
|The Greatest Show...|    20th Century Fox|
|Why the rise of t...|                 Vox|
|Dion Lewis' 103-Y...|                 NFL|
|(SPOILERS) 'Shiva...|                 amc|
|Marshmello - Bloc...|          marshmello|
|Which Countries A...|       NowThis World|
|SHOPPING FOR NEW ...|     The king of DIY|
|    The New SpotMini|      BostonDynamics|
|One Change That W...|             Cracked|
|How does your bod...|              TED-Ed|
|HomeMade Electric...|         PeterSripol|
|Founding An Inbre...|          

In [22]:
df.select(
    'likes',
    'dislikes',
    'views',
    (col('likes') - col('dislikes')).alias('acceptance')
).show()

+------+--------+-------+----------+
| likes|dislikes|  views|acceptance|
+------+--------+-------+----------+
| 57527|    2966| 748374|   54561.0|
| 97185|    6146|2418783|   91039.0|
|146033|    5339|3191434|  140694.0|
| 10172|     666| 343168|    9506.0|
|132235|    1989|2095731|  130246.0|
|  9763|     511| 119180|    9252.0|
| 15993|    2445|2103417|   13548.0|
| 23663|     778| 817732|   22885.0|
|  3543|     119| 826059|    3424.0|
| 12654|    1363| 256426|   11291.0|
|   655|      25|  81377|     630.0|
|  1576|     303| 104578|    1273.0|
|114188|    1333| 687582|  112855.0|
|  7848|    1171| 544770|    6677.0|
|  7473|     246| 207532|    7227.0|
|  9419|      52|  75752|    9367.0|
|  8011|     638| 295639|    7373.0|
|  5398|      53|  78044|    5345.0|
| 11963|      36|  97007|   11927.0|
|  8421|     191| 223871|    8230.0|
+------+--------+-------+----------+
only showing top 20 rows



In [23]:
df.selectExpr(
    'likes',
    'dislikes',
    'views',
    '(likes - dislikes) as acceptance'
).show()

+------+--------+-------+----------+
| likes|dislikes|  views|acceptance|
+------+--------+-------+----------+
| 57527|    2966| 748374|   54561.0|
| 97185|    6146|2418783|   91039.0|
|146033|    5339|3191434|  140694.0|
| 10172|     666| 343168|    9506.0|
|132235|    1989|2095731|  130246.0|
|  9763|     511| 119180|    9252.0|
| 15993|    2445|2103417|   13548.0|
| 23663|     778| 817732|   22885.0|
|  3543|     119| 826059|    3424.0|
| 12654|    1363| 256426|   11291.0|
|   655|      25|  81377|     630.0|
|  1576|     303| 104578|    1273.0|
|114188|    1333| 687582|  112855.0|
|  7848|    1171| 544770|    6677.0|
|  7473|     246| 207532|    7227.0|
|  9419|      52|  75752|    9367.0|
|  8011|     638| 295639|    7373.0|
|  5398|      53|  78044|    5345.0|
| 11963|      36|  97007|   11927.0|
|  8421|     191| 223871|    8230.0|
+------+--------+-------+----------+
only showing top 20 rows



In [24]:
df.selectExpr('count(distinct(video_id)) as videos').show()

+------+
|videos|
+------+
|  6837|
+------+



## Filter y Where

In [27]:
df.filter(col('likes') < 100).select('likes','title').show()

+-----+--------------------+
|likes|               title|
+-----+--------------------+
|    7|Dennis Smith Jr. ...|
|   35|Heidelberg's nift...|
|   21|Kellyanne Conway ...|
|   89|Train Swipes Park...|
|   52|7.3 Magnitude Ear...|
|   67|Some NFL players ...|
|   49|The Oak Beams of ...|
|    2|Huffy Metaloid Bi...|
|   22|Caterham Chris Ho...|
|   40|Basmati Blues - T...|
|    3|Sphaera - demonst...|
|    0|  Apple Clips sample|
|   57|World's first sel...|
|   62|Does Gwen Stefani...|
|   61|Amazon CEO Jeff B...|
|   75|Taylor Swift Inst...|
|   87|Willie Reed and J...|
|   83|Dennis Smith Jr. ...|
|   33|Joel Embiid on wi...|
|   23|Kellyanne Conway ...|
+-----+--------------------+
only showing top 20 rows



In [31]:
 df1 = spark.read.parquet("/content/dataPARQUET.parquet").where(col('likes') < 100).select('likes','title')

In [33]:
df1.show(truncate=False)

+-----+----------------------------------------------------------------------------------------------------+
|likes|title                                                                                               |
+-----+----------------------------------------------------------------------------------------------------+
|7    |Dennis Smith Jr. and LeBron James go back and forth                                                 |
|35   |Heidelberg's nifty hook-and-lateral to the left tackle                                              |
|21   |Kellyanne Conway on Roy Moore This Week Abc: Trump’s Not Being Briefed On This Bit By Bit 11/12/17  |
|89   |Train Swipes Parked Vehicle                                                                         |
|52   |7.3 Magnitude Earthquake Along Iraq-Iran Border Leaves At Least 400 Dead, 7,200 Injured | TIME      |
|67   |Some NFL players call for an end to Thursday night football after 8 players were injured last week  |
|49   |The Oak Beam

In [39]:
df_cero_likes = df1.filter((col('likes') > 50) & (col('likes') < 70))
df_cero_likes.show(truncate=False)

+-----+----------------------------------------------------------------------------------------------------+
|likes|title                                                                                               |
+-----+----------------------------------------------------------------------------------------------------+
|52   |7.3 Magnitude Earthquake Along Iraq-Iran Border Leaves At Least 400 Dead, 7,200 Injured | TIME      |
|67   |Some NFL players call for an end to Thursday night football after 8 players were injured last week  |
|57   |World's first self-driving shuttle crashes on first day of use                                      |
|62   |Does Gwen Stefani Know If Blake Shelton Is People's Sexiest Man Alive? | Access Hollywood           |
|61   |Amazon CEO Jeff Bezos and brother Mark give a rare interview about growing up and secrets to success|
|54   |Joel Embiid on win over Clippers, scuffle with Willie Reed and using social media | NBA on ESPN     |
|58   |Joel Embiid 

## Funciones distinct y dropDuplicates

In [42]:
df_sin_duplicados = df.distinct()

In [43]:
print("El conteo del dataframe original es {}".format(df.count()))
print("El conteo del dataframe sin duplicados es {}".format(df_sin_duplicados.count()))

El conteo del dataframe original es 48137
El conteo del dataframe sin duplicados es 41497


DropDuplicates a diferencia de distinct, permite indicar que columnas se consideran para establecer que es duplicado.

In [44]:
## Funcion drop duplicates
df_no_duplicados = df.dropDuplicates(['title'])

In [45]:
print("El conteo del dataframe original es {}".format(df.count()))
print("El conteo del dataframe sin duplicados es {}".format(df_no_duplicados.count()))

El conteo del dataframe original es 48137
El conteo del dataframe sin duplicados es 6508


## Sort y order by

In [46]:
df = (spark.read.parquet("/content/dataPARQUET.parquet")
.select('likes','views','video_id','dislikes')
.dropDuplicates(['video_id'])
      )

In [47]:
df.show()

+-----+-------+-----------+--------+
|likes|  views|   video_id|dislikes|
+-----+-------+-----------+--------+
| 6531| 475965|-0CMnp02rNY|     172|
| 4429| 563746|-0NYY8cqdiQ|      54|
|32752|1566807|-1Hm41N0dUs|     393|
| 5214| 129360|-1yT-K3c6YI|     108|
|  438|  67429|-2RVw2_QyxQ|      23|
|19339|1012527|-2aVkGcI7ZA|     633|
| 1444|  84744|-2b4qSoMnKE|     199|
|10350| 703371|-2wRFv-mScQ|     260|
|73480| 545655|-35jibKqbEo|     727|
|    2|   2863|-37nIo_tLnk|       0|
| 4028| 385104|-39ysKKpE7I|     343|
| 6468| 230360|-3h4Xt9No9o|     177|
|10384| 249601|-3nEHRN6IPg|     370|
|38776| 296237|-4s2MeUgduo|     466|
|71090| 390631|-5aaJJQFvOg|     635|
|21224| 744363|-66xHRJSPxs|     534|
|17882| 363370|-7AZX5Xtiks|     416|
|36960| 908989|-7UzyXO-mzk|     434|
|17120|1815030|-7_ATlZ-zMc|     633|
|  760| 252542|-8ZHXaGILlc|     100|
+-----+-------+-----------+--------+
only showing top 20 rows



In [48]:
df.sort('likes').show()

+-----+-----+--------------------+--------+
|likes|views|            video_id|dislikes|
+-----+-----+--------------------+--------+
| NULL| NULL|Awesome Games Pla...|    NULL|
| NULL| NULL|Filmed by Lucas F...|    NULL|
| NULL| NULL|    Beautiful Thing:|    NULL|
| NULL| NULL|Bon Appétit Test ...|    NULL|
| NULL| NULL|Filmed at the Bee...|    NULL|
| NULL| NULL|Britton Lane: htt...|    NULL|
| NULL| NULL|Allie Sherlock: h...|    NULL|
| NULL| NULL|Browse thousands ...|    NULL|
| NULL| NULL|   ABOUT BON APPÉTIT|    NULL|
| NULL| NULL|Catch Terry Crews...|    NULL|
| NULL| NULL|    ABOUT EPICURIOUS|    NULL|
| NULL| NULL|Check Out My WEBS...|    NULL|
| NULL| NULL|    ABOUT TEEN VOGUE|    NULL|
| NULL| NULL|Check out the Dam...|    NULL|
| NULL| NULL|         ABOUT VOGUE|    NULL|
| NULL| NULL|          City Song:|    NULL|
| NULL| NULL|Filmed at the Wal...|    NULL|
| NULL| NULL|            Clearly:|    NULL|
| NULL| NULL|Black Panther is ...|    NULL|
| NULL| NULL|Cook with confide..

In [54]:
from pyspark.sql.functions import desc
df.sort(col('likes').desc()).show()

+-----+-------+-----------+--------+
|likes|  views|   video_id|dislikes|
+-----+-------+-----------+--------+
|99990|2079137|2v4-L4PkV9U|    2844|
|99973|2465294|DSRSgMp5X1w|   17299|
|99952|3313449|LdhQzXHYLZ4|    5142|
| 9991|  98513|eBnXbImHX-g|      91|
| 9988|1162843|kz1xzBYppW8|    2555|
|99851|1053828|vRf3azp1pak|    1226|
| 9984| 206669|Lydh_saD9EQ|      88|
| 9984| 254807|Ps7GzIV2KP0|     294|
|  998|  71308|Hkx5fveyjIs|      74|
|  998|  54348|Pr6zjrF0Djg|      75|
|  998|  82087|hX643KbiI4s|      93|
|99761|1454233|h5CLO2n6OxQ|     692|
|  997|  27234|nb42DxagyOE|      13|
| 9969| 273905|c47kn_Y4y8A|     127|
| 9946| 242329|QXcbVHFE2bo|     148|
| 9939| 235293|1iGBHh1q0Kg|     232|
| 9926| 467558|hHFuZVGpBC0|     342|
|99254|1552618|0v-6AylRH68|    5195|
| 9925| 166235|flLc6LmAG6c|      50|
| 9921| 594536|e9NOwaiXqqA|     323|
+-----+-------+-----------+--------+
only showing top 20 rows



In [58]:
df.orderBy(col('likes').desc(),col('dislikes').desc()).show()

+-----+-------+-----------+--------+
|likes|  views|   video_id|dislikes|
+-----+-------+-----------+--------+
|99990|2079137|2v4-L4PkV9U|    2844|
|99973|2465294|DSRSgMp5X1w|   17299|
|99952|3313449|LdhQzXHYLZ4|    5142|
| 9991|  98513|eBnXbImHX-g|      91|
| 9988|1162843|kz1xzBYppW8|    2555|
|99851|1053828|vRf3azp1pak|    1226|
| 9984| 206669|Lydh_saD9EQ|      88|
| 9984| 254807|Ps7GzIV2KP0|     294|
|  998|  82087|hX643KbiI4s|      93|
|  998|  54348|Pr6zjrF0Djg|      75|
|  998|  71308|Hkx5fveyjIs|      74|
|99761|1454233|h5CLO2n6OxQ|     692|
|  997|  27234|nb42DxagyOE|      13|
| 9969| 273905|c47kn_Y4y8A|     127|
| 9946| 242329|QXcbVHFE2bo|     148|
| 9939| 235293|1iGBHh1q0Kg|     232|
| 9926| 467558|hHFuZVGpBC0|     342|
|99254|1552618|0v-6AylRH68|    5195|
| 9925| 166235|flLc6LmAG6c|      50|
| 9921| 594536|e9NOwaiXqqA|     323|
+-----+-------+-----------+--------+
only showing top 20 rows



In [59]:
top_10_likes = df.orderBy(col('likes').desc()).limit(10)
top_10_likes.show()

+-----+-------+-----------+--------+
|likes|  views|   video_id|dislikes|
+-----+-------+-----------+--------+
|99990|2079137|2v4-L4PkV9U|    2844|
|99973|2465294|DSRSgMp5X1w|   17299|
|99952|3313449|LdhQzXHYLZ4|    5142|
| 9991|  98513|eBnXbImHX-g|      91|
| 9988|1162843|kz1xzBYppW8|    2555|
|99851|1053828|vRf3azp1pak|    1226|
| 9984| 206669|Lydh_saD9EQ|      88|
| 9984| 254807|Ps7GzIV2KP0|     294|
|  998|  71308|Hkx5fveyjIs|      74|
|  998|  54348|Pr6zjrF0Djg|      75|
+-----+-------+-----------+--------+



## Funciones WithColumn y withColumnRenamed

WithColumn sirve para crear columnas

In [60]:
df_valoracion = df.withColumn('valoracion', (col('likes') - col('dislikes')))

In [61]:
df.printSchema()
df_valoracion.printSchema()

root
 |-- likes: string (nullable = true)
 |-- views: string (nullable = true)
 |-- video_id: string (nullable = true)
 |-- dislikes: string (nullable = true)

root
 |-- likes: string (nullable = true)
 |-- views: string (nullable = true)
 |-- video_id: string (nullable = true)
 |-- dislikes: string (nullable = true)
 |-- valoracion: double (nullable = true)



In [73]:
df_valoracion1 = (df.withColumn('valoracion', (col('likes') - col('dislikes')))
                    .withColumn('% 10',col('valoracion') % 10)
)

In [74]:
df_valoracion1.printSchema()

root
 |-- likes: string (nullable = true)
 |-- views: string (nullable = true)
 |-- video_id: string (nullable = true)
 |-- dislikes: string (nullable = true)
 |-- valoracion: double (nullable = true)
 |-- % 10: double (nullable = true)



WithColumnRenamed, únicamente para cambiar nombre de columas de un dataframe

In [75]:
df_renombrado = df.withColumnRenamed('video_id','id_video')
df.printSchema()
df_renombrado.printSchema()

root
 |-- likes: string (nullable = true)
 |-- views: string (nullable = true)
 |-- video_id: string (nullable = true)
 |-- dislikes: string (nullable = true)

root
 |-- likes: string (nullable = true)
 |-- views: string (nullable = true)
 |-- id_video: string (nullable = true)
 |-- dislikes: string (nullable = true)



No arroja error si una columna no existe

In [76]:
df_renombrado = df.withColumnRenamed('videonoexiste_id','id_video')
df.printSchema()
df_renombrado.printSchema()

root
 |-- likes: string (nullable = true)
 |-- views: string (nullable = true)
 |-- video_id: string (nullable = true)
 |-- dislikes: string (nullable = true)

root
 |-- likes: string (nullable = true)
 |-- views: string (nullable = true)
 |-- video_id: string (nullable = true)
 |-- dislikes: string (nullable = true)



## Drop , Sample y RandomSplit

In [77]:
df = spark.read.parquet("/content/dataPARQUET.parquet")
df.printSchema()

root
 |-- video_id: string (nullable = true)
 |-- trending_date: string (nullable = true)
 |-- title: string (nullable = true)
 |-- channel_title: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- publish_time: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- views: string (nullable = true)
 |-- likes: string (nullable = true)
 |-- dislikes: string (nullable = true)
 |-- comment_count: string (nullable = true)
 |-- thumbnail_link: string (nullable = true)
 |-- comments_disabled: string (nullable = true)
 |-- ratings_disabled: string (nullable = true)
 |-- video_error_or_removed: string (nullable = true)
 |-- description: string (nullable = true)



In [78]:
df_util = df.drop('comments_disabled','ratings_disabled','video_error_or_removed','description')
df_util.printSchema()

root
 |-- video_id: string (nullable = true)
 |-- trending_date: string (nullable = true)
 |-- title: string (nullable = true)
 |-- channel_title: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- publish_time: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- views: string (nullable = true)
 |-- likes: string (nullable = true)
 |-- dislikes: string (nullable = true)
 |-- comment_count: string (nullable = true)
 |-- thumbnail_link: string (nullable = true)



In [98]:
print(len(df_util.columns))

12


Sample

In [99]:
df_muestra = df.sample(0.8)
num_filas = df.count()
num_filas_muestra = df_muestra.count()

In [101]:
print("El 80% de filas del dataframe original es {}".format(num_filas - (num_filas*0.2)))

El 80% de filas del dataframe original es 38509.6


In [102]:
df_muestra = df.sample(fraction = 0.8, seed = 1234)

In [103]:
df_muestra = df.sample(withReplacement = True, fraction = 0.8, seed = 1234)

In [104]:
## Random Split
df_train, df_test = df.randomSplit([0.8,0.2], seed = 1234)

In [106]:
print("Las filas del dataframe train es {}".format(df_train.count()))
print(df_test.count())

Las filas del dataframe train es 38506
9631
